#Authors: [INSERT NAMES HERE]
#Author Date:
#Purpose: The purpose of this notebook is to house all data set transformation, cleansing, visualization, statistical analysis, and note-taking for the 2025 CU Athletic Department Sports Science Internship Program
#LAST UPDATED:
#Including helpful libraries
library(tidyverse)
library(readxl)
library(aod)
library(gt)
library(boot)
library(mgcv)
library(lme4)
library(leaps)
#loading in the Incident Report to look at HSIs
#(HSI presumably = hamstring strain injury -- TODO confirm; the OSICS14 code
#"TM1" used to filter this table later supports that reading)
#NOTE(review): the console reports parsing issues and an unnamed first column
#for this file; consider passing explicit col_types and inspecting problems()
Incident_Report <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Incident Report FB IDs.csv")
New names:
• `` -> `...1`
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)
Rows: 2220 Columns: 225
── Column specification ─────────────────────────────────────────────────────────────────
Delimiter: ","
chr (163): anon_id, Date, Sport, Position, Medical.Alerts, View.Medical.Alerts, Inci...
dbl (31): ...1, Days.to.Exam, Days.in.Status, Games.Missed, Injury.to.Exam.Duration...
lgl (28): MRN, Converted.to.injury.from.maintenance.record, What.sport.is.this.inci...
time (3): Time.of.Injury, Time.Entered.On, Time.Stamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean the Catapult session export for the velocity analysis
Catapult_Session_clean <- Catapult_Session %>%
  # Parse the m/d/Y text dates into a Date class
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  # Keep only the columns this analysis uses; band effort counts exist for
  # bands 2-8 only (band 1 has just a distance column)
  select(anon_id, Date, Age, Primary.Position, Total.Distance, Period.Name,
         Total.Duration..min.,
         all_of(c(paste0("Velocity.Band.", 1:8, ".Total.Distance"),
                  paste0("Velocity.Band.", 2:8, ".Total.Effort.Count"))),
         Maximum.Velocity, Average.Velocity, Hit.90.Percent.Max,
         Date.of.Last.90.Effort, Days.Since.Last.90.Effort, Hit.Max.Velocity.,
         Date.of.All.Time.Max.Velocity, Days.Since.Max.Velocity,
         Session.Max.Velocity) %>%
  # Per-player all-time maximum velocity.
  # FIX: max(na.omit(x)) on an all-NA player returned -Inf with a warning;
  # return NA instead so downstream ratios stay NA rather than 0/-Inf.
  group_by(anon_id) %>%
  mutate(Player.Max.Velocity = if (all(is.na(Maximum.Velocity))) NA_real_
         else max(Maximum.Velocity, na.rm = TRUE)) %>%
  ungroup() %>%
  # Restrict to data from January 1, 2024 onward
  filter(Date >= as.Date("2024-01-01"))
head(Catapult_Session_clean)
# Clean the historical running-imbalance data
Historical_Running_clean <- Historical_Running %>%
  # Drop rows with no imbalance reading
  filter(Running.Imbalance != "n/a") %>%
  # Imbalance as numeric; date text into a Date class
  mutate(Running.Imbalance = as.numeric(Running.Imbalance),
         Date = as.Date(Date, "%m/%d/%Y")) %>%
  # Restrict to January 1, 2024 onward
  filter(Date >= as.Date("2024-01-01")) %>%
  # FIX: row index was the hard-coded literal 1:4063, which errors or
  # misnumbers as soon as the filtered row count changes; derive it instead
  mutate(X = row_number()) %>%
  # Days since each player's first observation
  group_by(anon_id) %>%
  mutate(Days.Since.Start = as.numeric(Date - min(Date))) %>%
  ungroup()
head(Historical_Running_clean)
# Clean the incident report down to in-scope hamstring injuries
Incident_Report_clean <- Incident_Report %>%
#filtering for only hamstring injuries
#("TM1" is presumably the OSICS-14 hamstring-strain code -- confirm with staff;
# "Full Go" rows are excluded as they indicate no time lost)
filter(OSICS14.Code == "TM1",
Status != "Full Go") %>%
#getting the date of the injury as a date class (all dates arrive as m/d/Y text)
mutate(Date = as.Date(Date, "%m/%d/%Y"),
Date.of.Injury = as.Date(Date.of.Injury...Onset.of.symptoms, "%m/%d/%Y"),
Examination.Date = as.Date(Examination.Date, "%m/%d/%Y")) %>%
#only selecting relevant columns for this analysis
select(anon_id, Position, Date, Date.of.Injury, Time.of.Injury, Side, OSICS.Injury.Diagnosis, Coach.s.Diagnosis, Recurrence.of.Injury, Choose.Season, Onset.of.Symptoms, Injury.Prognosis, General.Mechanism, Specific.Mechanism, Injured.While., Type.of.Event, Season., Status, Days.in.Status) %>%
#making days out due to injury for each player and each injury they sustained
#(one injury spans several status rows; Days.Out totals them per injury date)
group_by(anon_id, Date.of.Injury) %>%
mutate(Days.Out = sum(Days.in.Status)) %>%
ungroup()
head(Incident_Report_clean)
# Partition athletes into injured / uninjured groups
# Every athlete with running-imbalance data
all_IDs <- unique(Historical_Running_clean$anon_id)
# Injured athletes that also appear in the imbalance data
injured_IDs <- intersect(unique(Incident_Report_clean$anon_id), all_IDs)
# Athletes with imbalance data but no recorded injury
uninjured_IDs <- Historical_Running_clean %>%
  filter(!anon_id %in% injured_IDs) %>%
  pull(anon_id) %>%
  unique()
# Restrict the incident report to injured athletes with imbalance data
Incident_Report_clean <- Incident_Report_clean %>%
  filter(anon_id %in% injured_IDs)
# Keep athletes with imbalance data, injured or not
Historical_Running_clean <- Historical_Running_clean %>%
  filter(anon_id %in% c(injured_IDs, uninjured_IDs))
# Drop the raw, uncleaned data sets
rm(Incident_Report, Historical_Running, Catapult_Session)
# Bar chart: how often each player reaches >= 90% of maximum velocity
# Count one hit per athlete-day during the 2024-25 training season
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30"),
         Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  count(anon_id, name = "times_hit_90")
# Team-wide average, reused for the reference line and subtitle
team_mean_hits <- mean(hit_90_counts$times_hit_90)
# Bar chart of every player's count with the team average dashed in
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_hline(yintercept = team_mean_hits,
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  labs(title = "Player Counts for Achieving ≥ 90% of Maximum Velocity During 2024–25 Season",
       subtitle = paste("Team Average:", round(team_mean_hits, 1)),
       x = "Athlete ID", y = "Times ≥ 90%") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 6, angle = 90))
# Recount >= 90% hits, this time keeping each athlete's position
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, .keep_all = TRUE) %>% # Keep position data
  group_by(anon_id, Primary.Position) %>%       # Group by both ID and position
  summarise(times_hit_90 = n(), .groups = "drop")
# QB subset for the position-specific chart
QBs <- hit_90_counts %>%
  filter(Primary.Position == "QB")
# Averages for the dashed reference lines
overall_avg <- mean(hit_90_counts$times_hit_90)
qb_avg <- mean(QBs$times_hit_90)
ggplot(QBs, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_text(aes(label = times_hit_90),
            vjust = -0.5,
            size = 3.5) +
  geom_hline(yintercept = overall_avg,
             linetype = "dashed", color = "#000000") +
  geom_hline(yintercept = qb_avg,
             linetype = "dashed", color = "#565A5C") +
  annotate("text", x = 1, y = overall_avg + 0.5,
           label = "Team Avg", color = "#000000", size = 3) +
  annotate("text", x = 1, y = qb_avg + 0.5,
           label = "QB Avg", color = "#565A5C", size = 3) +
  labs(title = "QB counts for reaching > 90% max velocity",
       # FIX: round the displayed average (was raw, e.g. 9.75), matching the
       # rounding used by every other plot's subtitle
       subtitle = paste("QB Average:", round(qb_avg, 1))) +
  theme_classic()
# Facet plot: one panel per position
# Rebuild hit_90_counts with positions attached
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, .keep_all = TRUE) %>% # retains Primary.Position
  count(anon_id, Primary.Position, name = "times_hit_90")
# Team-wide reference average
overall_avg <- mean(hit_90_counts$times_hit_90)
# Faceted bar charts, one panel per position, shared team-average line
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
  geom_hline(yintercept = overall_avg, linetype = "dashed", color = "#000000") +
  annotate("text", x = 1, y = overall_avg + 0.5,
           label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
  facet_wrap(~ Primary.Position, scales = "free_x") +
  labs(title = "Counts of >90% Max Velocity by Player and Position",
       y = "Times > 90% Max Velocity",
       x = "Player (anon_id)") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# One bar chart per position
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, .keep_all = TRUE) %>%
  group_by(anon_id, Primary.Position) %>%
  summarise(times_hit_90 = n(), .groups = "drop")
# Overall team average, computed once
team_avg <- mean(hit_90_counts$times_hit_90)
# Positions to loop through
positions <- unique(hit_90_counts$Primary.Position)
# Build the >=90% bar chart for one position.
#   pos         - position label to plot (e.g. "QB")
#   data        - per-athlete counts with Primary.Position and times_hit_90;
#                 parameterized so the function no longer reaches for globals
#   overall_avg - team-wide reference average
# Returns a ggplot object.
plot_hit_90_by_position <- function(pos,
                                    data = hit_90_counts,
                                    overall_avg = team_avg) {
  position_data <- data %>%
    filter(Primary.Position == pos)
  pos_avg <- mean(position_data$times_hit_90)
  ggplot(position_data, aes(x = anon_id, y = times_hit_90)) +
    geom_bar(stat = "identity", fill = "#CFB87C") +
    geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
    geom_hline(yintercept = pos_avg, linetype = "dashed", color = "#565A5C") +
    annotate("text", x = 1, y = pos_avg + 0.5,
             label = "Pos Avg", color = "#565A5C", size = 3, hjust = 0) +
    geom_hline(yintercept = overall_avg, linetype = "dashed", color = "#000000") +
    annotate("text", x = 1, y = overall_avg + 0.5,
             label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
    labs(title = paste("Times ≥90% Max Velocity –", pos),
         subtitle = paste0("Position Avg: ", round(pos_avg, 1),
                           " | Team Avg: ", round(overall_avg, 1)),
         y = "Count",
         x = "anon_id") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
# Render every position's plot (print() returns each plot invisibly, so the
# list also keeps the plot objects for later reuse)
plots_by_position <- lapply(positions, function(pos) {
  print(plot_hit_90_by_position(pos))
})
# Table of per-position averages, with the team row pinned on top
position_averages <- hit_90_counts %>%
  summarise(avg_times_hit_90 = mean(times_hit_90), .by = Primary.Position) %>%
  arrange(desc(avg_times_hit_90))
# Prepend a synthetic "Team Average" row
position_averages_with_team <- bind_rows(
  tibble(Primary.Position = "Team Average", avg_times_hit_90 = team_avg),
  position_averages
)
# Render as a gt table
position_averages_with_team %>%
  gt() %>%
  tab_header(
    title = "Average Times >90% Max Velocity by Position and Team"
  )
| Average Times >90% Max Velocity by Position and Team | |
| Primary.Position | avg_times_hit_90 |
|---|---|
| Team Average | 11.768421 |
| DB | 18.352941 |
| WR | 15.666667 |
| DL | 10.789474 |
| RB | 10.500000 |
| QB | 9.750000 |
| LB | 8.916667 |
| TE | 8.400000 |
| OL | 6.764706 |
# >90% counts over the course of the season
# When during the season are players reaching >=90% most often?
# Daily count of athletes with at least one >=90% effort
daily_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30"),
         Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  count(Date, name = "daily_hits")
# Daily trend line with a loess smoother
ggplot(daily_90_counts, aes(x = Date, y = daily_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_smooth(method = "loess", se = FALSE, color = "#565A5C", linetype = "dashed") +
  labs(title = "Daily Count of Players Reaching ≥ 90% of Max Velocity",
       subtitle = "Over the 2024–25 Training Season",
       x = "Date", y = "Times reached ≥ 90%") +
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'
The data are noisy, so let's try grouping by week instead of by day.
# Weekly >90% counts -- smoother signal than the noisy daily series
weekly_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30"),
         Date <= as.Date("2025-07-01"),
         Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  mutate(week = floor_date(Date, unit = "week")) %>%
  count(week, name = "weekly_hits")
# Weekly trend with one point per week
ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_point(color = "#565A5C") +
  labs(title = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
       x = "Week", y = "Times reached ≥ 90%") +
  theme_classic()
# Bar chart of weekly >90% counts with the season average dashed in
weekly_avg_hits <- mean(weekly_90_counts$weekly_hits)
ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_col(fill = "#CFB87C", color = "#565A5C", width = 7) +
  geom_text(aes(label = weekly_hits),
            vjust = -0.4,
            size = 2.5,
            color = "black") +
  geom_hline(yintercept = weekly_avg_hits,
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  labs(
    title = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
    subtitle = paste("Season Average per Week:", round(weekly_avg_hits, 1)),
    x = "Week",
    y = "Times reached ≥ 90%"
  ) +
  scale_x_date(date_breaks = "2 weeks", date_labels = "%b %d") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Weekly sprint-count totals per athlete
# Add a week variable to the session data
Catapult_Session_clean <- Catapult_Session_clean %>%
  mutate(week = floor_date(Date, unit = "week"))
# Every athlete crossed with every week in range, so zero-sprint weeks survive
all_athletes <- unique(Catapult_Session_clean$anon_id)
all_weeks <- seq.Date(as.Date("2024-06-30"), as.Date("2025-07-01"), by = "week")
full_grid <- expand_grid(anon_id = all_athletes, week = all_weeks)
# Sprints (days with a >=90% effort) per athlete-week
sprint_counts <- Catapult_Session_clean %>%
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, .keep_all = TRUE) %>%
  group_by(anon_id, week) %>%
  summarise(sprint_count = n(), .groups = "drop")
# Join onto the full grid; athlete-weeks with no sprints become 0
weekly_sprint_counts <- full_grid %>%
  left_join(sprint_counts, by = c("anon_id", "week")) %>%
  mutate(sprint_count = replace_na(sprint_count, 0))
# Bucket the weekly load
weekly_sprint_counts <- weekly_sprint_counts %>%
  mutate(sprint_load_group = case_when(
    sprint_count <= 1 ~ "Low (≤1 sprint)",
    sprint_count %in% 2:3 ~ "Moderate (2–3 sprints)",
    # FIX: the top bucket was `%in% 4:5`, so any week with 6+ sprints fell
    # through to NA and silently vanished from grouped summaries; the label
    # text is kept unchanged for downstream consistency
    sprint_count >= 4 ~ "High (4–5 sprints)"
  ))
# Merge weekly sprint counts with injury events
# One row per athlete-week in which a hamstring injury occurred
injury_weeks <- Incident_Report_clean %>%
  mutate(week = floor_date(as.Date(Date.of.Injury), unit = "week")) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30"),
         Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)
# Flag injury weeks in the sprint data; non-matches become 0 (no injury)
weekly_sprint_counts <- weekly_sprint_counts %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = ifelse(is.na(injury), 0, injury))
# Compare sprint volume in injury vs non-injury weeks
weekly_sprint_counts %>%
  group_by(injury) %>%
  summarise(
    mean_sprints = mean(sprint_count, na.rm = TRUE),
    sd_sprints = sd(sprint_count, na.rm = TRUE),
    n = n()
  )
# Per-athlete weekly sprint trajectories with injury weeks highlighted
ggplot(weekly_sprint_counts, aes(x = week, y = sprint_count, color = factor(injury))) +
  geom_line(aes(group = anon_id), alpha = 0.5) +
  geom_point(data = subset(weekly_sprint_counts, injury == 1), size = 2, shape = 21, fill = "red") +
  scale_color_manual(values = c("0" = "grey", "1" = "red"), labels = c("No Injury", "Injury")) +
  labs(title = "Weekly Sprint Counts with Injury Events", x = "Week", y = "Sprint Count", color = "Injury") +
  theme_minimal()
No injuries occurred when an athlete had 4 or 5 sprint counts within a week; however, there are limited data points at these sprint counts. There are also only three injuries that occur in the moderate range of 2–3 sprint counts. The most injuries occur in the low range of 0–1 sprints.
This suggests that underexposure to sprinting may increase injury risk.
# Injury rate within each sprint-load bucket
injury_by_group <- weekly_sprint_counts %>%
  group_by(sprint_load_group) %>%
  summarise(
    total_weeks = n(),
    injuries = sum(injury),
    # mean of a 0/1 flag == injuries / total_weeks
    injury_rate = mean(injury)
  )
injury_by_group
The moderate group (2–3 sprints) has the highest injury rate (1.18%). Low sprints still had injuries, but at a lower rate (0.36%). High sprints had zero injuries, but the sample size is very small, so we cannot draw a reliable conclusion from this group.
# Build the sprint-group x injury contingency table.
# FIX: the counts were previously hand-typed into matrix(); deriving them from
# weekly_sprint_counts keeps the test in sync if upstream cleaning changes.
# (table() drops NA groups by default; row order does not affect the p-value.)
sprint_injury_table <- table(
  sprint_group = weekly_sprint_counts$sprint_load_group,
  injury = factor(weekly_sprint_counts$injury,
                  levels = c(1, 0),
                  labels = c("Injury", "No Injury"))
)
# Run Fisher's Exact Test (small expected counts rule out chi-squared)
fisher.test(sprint_injury_table)
Fisher's Exact Test for Count Data
data: sprint_injury_table
p-value = 0.1064
alternative hypothesis: two.sided
Null hypothesis: Injury occurrence is independent of sprint load group. Alternative: Injury occurrence is associated with sprint load group.
Because the p-value is > 0.05, we fail to reject the null. There is no strong evidence of a significant association between sprinting effort group and injury occurrence based on the current data.
Now we will explore the previous week’s sprint count.
# Explore the previous week's sprint count
# Shift sprint counts by one week within each athlete
weekly_sprint_counts_lagged <- weekly_sprint_counts %>%
  arrange(anon_id, week) %>%
  group_by(anon_id) %>%
  mutate(
    sprint_count_lag1 = lag(sprint_count),
    sprint_load_group_lag1 = lag(sprint_load_group)
  ) %>%
  ungroup()
# Injury weeks (same derivation as before)
injury_weeks <- Incident_Report_clean %>%
  mutate(
    week = floor_date(as.Date(Date.of.Injury), unit = "week")
  ) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)
# Merge with sprint data using the CURRENT week for injury.
# FIX: weekly_sprint_counts_lagged already carried an `injury` column, so the
# join produced `injury.x`/`injury.y` and `replace_na(injury, 0)` errored with
# "object 'injury' not found". Drop the stale column before re-joining.
sprint_prior_to_injury <- weekly_sprint_counts_lagged %>%
  select(-injury) %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = replace_na(injury, 0)) # Fill missing with 0s
# Number each athlete's weeks in chronological order
weekly_sprint_counts <- weekly_sprint_counts %>%
  arrange(week) %>%
  group_by(anon_id) %>%
  mutate(week_number = row_number()) %>%
  ungroup()
# Weeks immediately preceding an injury week
pre_injury_weeks <- weekly_sprint_counts %>%
  group_by(anon_id) %>%
  mutate(
    next_injury = lead(injury),
    is_pre_injury = ifelse(next_injury == 1, 1, 0)
  ) %>%
  ungroup() %>%
  filter(is_pre_injury == 1)
# Average sprints in the week before an injury ...
pre_injury_weeks %>%
  summarise(mean_pre_injury_sprints = mean(sprint_count, na.rm = TRUE))
# ... versus all weeks with no injury
weekly_sprint_counts %>%
  filter(injury == 0) %>%
  summarise(mean_non_injury_sprints = mean(sprint_count, na.rm = TRUE))
NA
# Sprint-count distributions split by injury status
ggplot(weekly_sprint_counts,
       aes(x = factor(injury), y = sprint_count, fill = factor(injury))) +
  geom_boxplot() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon")) +
  labs(title = "Sprint Count Distribution by Injury Status",
       x = "Injury",
       y = "Sprint Count") +
  theme_minimal()
# Weekly velocity-band effort totals
# NOTE(review): this re-filters and overwrites Catapult_Session_clean in
# place, so every later chunk only sees the 2024-06-30..2025-07-01 window
Catapult_Session_clean <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  mutate(week = floor_date(Date, unit = "week"))
# Sum band effort counts per athlete-week
weekly_velocity_efforts <- Catapult_Session_clean %>%
  group_by(anon_id, week) %>%
  summarise(
    V2 = sum(Velocity.Band.2.Total.Effort.Count, na.rm = TRUE),
    V3 = sum(Velocity.Band.3.Total.Effort.Count, na.rm = TRUE),
    V4 = sum(Velocity.Band.4.Total.Effort.Count, na.rm = TRUE),
    V5 = sum(Velocity.Band.5.Total.Effort.Count, na.rm = TRUE),
    V6 = sum(Velocity.Band.6.Total.Effort.Count, na.rm = TRUE),
    V7 = sum(Velocity.Band.7.Total.Effort.Count, na.rm = TRUE),
    V8 = sum(Velocity.Band.8.Total.Effort.Count, na.rm = TRUE),
    total_efforts = V2 + V3 + V4 + V5 + V6 + V7 + V8,
    # FIX: max(..., na.rm = TRUE) on an all-NA week returns -Inf with a
    # warning; return NA so pct_of_max_velocity stays NA for that week
    Weekly_Max_Velocity = if (all(is.na(Maximum.Velocity))) NA_real_
                          else max(Maximum.Velocity, na.rm = TRUE),
    Player_Max_Velocity = first(Player.Max.Velocity),
    .groups = "drop"
  ) %>%
  # FIX: drop zero-effort weeks BEFORE computing percentages so the division
  # never produces 0/0 = NaN rows that only existed to be filtered later
  filter(total_efforts > 0) %>%
  mutate(
    # Weekly top speed as a share of the player's all-time max
    pct_of_max_velocity = (Weekly_Max_Velocity / Player_Max_Velocity) * 100,
    # Share of the week's efforts spent in each band
    pct_V2 = (V2 / total_efforts) * 100,
    pct_V3 = (V3 / total_efforts) * 100,
    pct_V4 = (V4 / total_efforts) * 100,
    pct_V5 = (V5 / total_efforts) * 100,
    pct_V6 = (V6 / total_efforts) * 100,
    pct_V7 = (V7 / total_efforts) * 100,
    pct_V8 = (V8 / total_efforts) * 100
  )
# Plot for each player
# Loop over all players and display their plots
unique(weekly_velocity_efforts$anon_id) %>%
  lapply(function(player_id) {
    # Long format: one row per week x band, keeping the weekly top-speed %
    player_weekly_data <- weekly_velocity_efforts %>%
      filter(anon_id == player_id) %>%
      select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
      pivot_longer(
        cols = starts_with("pct_V"),
        names_to = "velocity_band",
        values_to = "percent_effort"
      ) %>%
      mutate(
        velocity_band = factor(
          velocity_band,
          levels = paste0("pct_V", 8:2),
          labels = paste0("V", 8:2)
        )
      )
    # Skip players with no qualifying weeks
    if (nrow(player_weekly_data) == 0) return(NULL)
    # Stacked band distribution with the top-speed trend overlaid
    plot <- ggplot(player_weekly_data, aes(x = week)) +
      geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
      # FIX: `size` is deprecated for lines since ggplot2 3.4 -- use `linewidth`
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", linewidth = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2" = "darkgreen",
          "V3" = "green2",
          "V4" = "greenyellow",
          "V5" = "yellow",
          "V6" = "orange",
          "V7" = "tomato",
          "V8" = "firebrick"
        )
      ) +
      scale_y_continuous(
        name = "Band Distribution (%)",
        # the secondary axis is an identity copy of the primary scale
        sec.axis = sec_axis(~ ., name = "Weekly Max Velocity (% of PR)")
      ) +
      labs(
        title = paste("Velocity Band Effort % and Speed Trend for Player", player_id),
        x = "Week",
        fill = "Velocity Band"
      ) +
      theme_classic()
    print(plot) # Display plot
    return(NULL)
  })
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL
[[56]]
NULL
[[57]]
NULL
[[58]]
NULL
[[59]]
NULL
[[60]]
NULL
[[61]]
NULL
[[62]]
NULL
[[63]]
NULL
[[64]]
NULL
[[65]]
NULL
[[66]]
NULL
[[67]]
NULL
[[68]]
NULL
[[69]]
NULL
[[70]]
NULL
[[71]]
NULL
[[72]]
NULL
[[73]]
NULL
[[74]]
NULL
[[75]]
NULL
[[76]]
NULL
[[77]]
NULL
[[78]]
NULL
[[79]]
NULL
[[80]]
NULL
[[81]]
NULL
[[82]]
NULL
[[83]]
NULL
[[84]]
NULL
[[85]]
NULL
[[86]]
NULL
[[87]]
NULL
[[88]]
NULL
[[89]]
NULL
[[90]]
NULL
[[91]]
NULL
[[92]]
NULL
[[93]]
NULL
[[94]]
NULL
[[95]]
NULL
[[96]]
NULL
[[97]]
NULL
[[98]]
NULL
[[99]]
NULL
[[100]]
NULL
[[101]]
NULL
[[102]]
NULL
[[103]]
NULL
[[104]]
NULL
# Pick one player to inspect in detail
player_id <- "ID_93"
# That player's weekly effort summary
player_data <- weekly_velocity_efforts %>%
  filter(anon_id == player_id)
# Total sprinting efforts per week
ggplot(player_data, aes(x = week, y = total_efforts)) +
  # FIX: `size` is deprecated for lines since ggplot2 3.4 -- use `linewidth`
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_point(color = "#CFB87C", size = 2) +
  labs(
    title = paste("Total Sprinting Efforts per Week for Player", player_id),
    x = "Week",
    y = "Total Sprinting Efforts"
  ) +
  theme_classic()
# Sprint effort distribution by velocity band for the selected player
# Absolute effort counts in long format, one row per week x band
player_counts_long <- player_data %>%
  select(week, V2, V3, V4, V5, V6, V7, V8) %>%
  pivot_longer(
    cols = V2:V8,
    names_to = "velocity_band",
    values_to = "effort_count"
  ) %>%
  # Order bands fastest-first so the stack reads top-down
  mutate(velocity_band = factor(velocity_band, levels = rev(paste0("V", 2:8))))
# Stacked weekly bars colored by band
ggplot(player_counts_long, aes(x = week, y = effort_count, fill = velocity_band)) +
  geom_col(position = "stack") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = paste("Sprint Effort Distribution by Velocity Band for Player", player_id),
    x = "Week",
    y = "Sprint Effort Count",
    fill = "Velocity Band"
  ) +
  theme_classic()
# Correlations between top-speed attainment and band effort shares
cor_matrix <- weekly_velocity_efforts %>%
  select(pct_of_max_velocity, starts_with("pct_V")) %>%
  cor(use = "pairwise.complete.obs")
cor_matrix
pct_of_max_velocity pct_V2 pct_V3 pct_V4 pct_V5
pct_of_max_velocity 1.00000000 -0.5213861 0.05869457 0.4422664 0.4482732
pct_V2 -0.52138614 1.0000000 -0.15363378 -0.8795243 -0.8605602
pct_V3 0.05869457 -0.1536338 1.00000000 0.3272849 -0.2133725
pct_V4 0.44226643 -0.8795243 0.32728485 1.0000000 0.6831749
pct_V5 0.44827321 -0.8605602 -0.21337249 0.6831749 1.0000000
pct_V6 0.41150125 -0.7765560 -0.37704110 0.4754871 0.8333338
pct_V7 0.33727593 -0.6129867 -0.43386873 0.2712374 0.5853417
pct_V8 0.33778410 -0.5101834 -0.42332967 0.1965351 0.4713043
pct_V6 pct_V7 pct_V8
pct_of_max_velocity 0.4115013 0.3372759 0.3377841
pct_V2 -0.7765560 -0.6129867 -0.5101834
pct_V3 -0.3770411 -0.4338687 -0.4233297
pct_V4 0.4754871 0.2712374 0.1965351
pct_V5 0.8333338 0.5853417 0.4713043
pct_V6 1.0000000 0.8067562 0.6559091
pct_V7 0.8067562 1.0000000 0.8731903
pct_V8 0.6559091 0.8731903 1.0000000
V4 and V5 appear to be slightly more predictive of top-speed achievement than V6, V7 or V8 — possibly because they’re more frequently reached zones.
V2 has a moderately strong negative linear relationship with reaching top-speed. When athletes spend more effort in this low-speed band, their weekly top speed (as % of max) tends to be lower.
# Linear model: weekly top speed (% of player max) predicted from the share
# of effort in each velocity band.
# NOTE(review): pct_V2..pct_V8 sum to 100 by construction, so the predictors
# are perfectly collinear with the intercept; lm() aliases one coefficient
# (the summary reports "1 not defined because of singularities" and pct_V8 is
# NA). Dropping one band, or modeling absolute counts, would avoid this.
model_all_bands <- lm(
pct_of_max_velocity ~ pct_V2 + pct_V3 + pct_V4 + pct_V5 + pct_V6 + pct_V7 + pct_V8,
data = weekly_velocity_efforts
)
summary(model_all_bands)
Call:
lm(formula = pct_of_max_velocity ~ pct_V2 + pct_V3 + pct_V4 +
pct_V5 + pct_V6 + pct_V7 + pct_V8, data = weekly_velocity_efforts)
Residuals:
Min 1Q Median 3Q Max
-60.296 -4.649 1.038 5.861 24.418
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 296.9092 32.0021 9.278 < 2e-16 ***
pct_V2 -2.3814 0.3220 -7.396 2.11e-13 ***
pct_V3 -1.8554 0.3234 -5.737 1.12e-08 ***
pct_V4 -1.9669 0.3371 -5.834 6.35e-09 ***
pct_V5 -1.8813 0.3535 -5.322 1.15e-07 ***
pct_V6 -1.7832 0.3502 -5.092 3.90e-07 ***
pct_V7 -3.1034 0.5924 -5.238 1.80e-07 ***
pct_V8 NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9.707 on 1872 degrees of freedom
Multiple R-squared: 0.2859, Adjusted R-squared: 0.2836
F-statistic: 124.9 on 6 and 1872 DF, p-value: < 2.2e-16
# Week-over-week change in top speed (% of max), per athlete
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  arrange(anon_id, week) %>%
  mutate(
    pct_of_max_velocity_change = pct_of_max_velocity - lag(pct_of_max_velocity),
    .by = anon_id
  )
# Scatterplot with loess smoother for V8 band usage
# Keep only weeks in which the athlete actually logged V8 effort
V8 <- weekly_velocity_efforts %>%
  filter(pct_V8 > 0)
# Change in weekly top speed vs. share of effort in the fastest band
ggplot(V8, aes(x = pct_V8, y = pct_of_max_velocity_change)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred") +
  labs(
    title = "Change in Max Speed vs. V8 Band Usage",
    x = "Percent of Total Effort in V8 Band (%)",
    y = "Change in Max Speed from Prior Week (%)"
  ) +
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 60 rows containing non-finite outside the scale range (`stat_smooth()`).
Warning: Removed 60 rows containing missing values or values outside the scale range
(`geom_point()`).
# One-week lags of every band count and band share, per athlete.
# FIX(idiom): replaces 14 hand-written lag() lines with a single across();
# .names = "lag_{.col}" reproduces the exact original column names
# (lag_V2..lag_V8, lag_pct_V2..lag_pct_V8).
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  group_by(anon_id) %>%
  mutate(across(
    all_of(c(paste0("V", 2:8), paste0("pct_V", 2:8))),
    lag,
    .names = "lag_{.col}"
  )) %>%
  ungroup()
# Attach a position to each athlete.
# FIX: athletes can appear under several Primary.Position values across
# sessions, so the previous distinct()-based lookup fanned the join out
# (the console showed a many-to-many warning and athlete-weeks duplicated).
# Keep exactly one position per athlete -- the most frequent one -- and
# assert the expected cardinality on the join.
player_positions <- Catapult_Session_clean %>%
  count(anon_id, Primary.Position) %>%
  slice_max(order_by = n, n = 1, by = anon_id, with_ties = FALSE) %>%
  select(anon_id, Primary.Position)
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  left_join(player_positions, by = "anon_id", relationship = "many-to-one")
# Collapse positions into the three training groups
combo_positions <- c("QB", "LB", "TE", "RB")
big_positions   <- c("OL", "DL", "DE")
skill_positions <- c("WR", "DB", "DB, WR")
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  mutate(
    Position_Group = case_when(
      Primary.Position %in% combo_positions ~ "COMBO",
      Primary.Position %in% big_positions ~ "BIGS",
      Primary.Position %in% skill_positions ~ "SKILL",
      TRUE ~ "OTHER"
    )
  ) %>%
  # Anything unrecognized is dropped from the analysis
  filter(Position_Group != "OTHER")
# Distribution of weekly top-speed change within each group
ggplot(weekly_velocity_efforts,
       aes(x = Position_Group, y = pct_of_max_velocity_change)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red", outlier.size = 1.5) +
  labs(
    title = "Weekly Change in % of Max Velocity by Position",
    x = "Position",
    y = "% Change in Max Velocity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Warning: Removed 105 rows containing non-finite outside the scale range (`stat_boxplot()`).
# Group-week means of every band metric.
# FIX(idiom): 21 near-identical mean() lines collapsed into one across();
# .names = "mean_{.col}" reproduces the original column names and order
# (mean_V2.., mean_lag_V2.., mean_pct_V2..). `.groups = "drop"` replaces the
# trailing ungroup() and silences the "grouped output" message.
weekly_band_effort_by_group <- weekly_velocity_efforts %>%
  group_by(week, Position_Group) %>%
  summarise(
    across(
      all_of(c(paste0("V", 2:8),
               paste0("lag_V", 2:8),
               paste0("pct_V", 2:8))),
      ~ mean(.x, na.rm = TRUE),
      .names = "mean_{.col}"
    ),
    mean_weekly_max_velocity = mean(Weekly_Max_Velocity, na.rm = TRUE),
    mean_pct_max_velocity = mean(pct_of_max_velocity, na.rm = TRUE),
    mean_pct_max_velocity_change = mean(pct_of_max_velocity_change, na.rm = TRUE),
    n_players = n(),
    .groups = "drop"
  )
# Pivot the group-week band shares long for plotting
band_long <- weekly_band_effort_by_group %>%
  pivot_longer(
    cols = starts_with("mean_pct_V"),
    names_to = "velocity_band",
    values_to = "mean_pct_effort"
  ) %>%
  mutate(
    velocity_band = factor(velocity_band, levels = paste0("mean_pct_V", 2:8))
  )
# One line per band, faceted by position group
ggplot(band_long, aes(x = week, y = mean_pct_effort, color = velocity_band)) +
  # FIX: `size` is deprecated for lines since ggplot2 3.4 -- use `linewidth`
  geom_line(linewidth = 1) +
  facet_wrap(~ Position_Group) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group",
    x = "Week",
    y = "Mean % Effort",
    color = "Velocity Band"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Mean weekly max velocity per position group
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_weekly_max_velocity, color = Position_Group)) +
  # FIX: `size` is deprecated for lines since ggplot2 3.4 -- use `linewidth`
  geom_line(linewidth = 1.2) +
  labs(
    title = "Mean Weekly Max Velocity by Position Group",
    x = "Week",
    y = "Mean Weekly Max Velocity"
  ) +
  theme_classic()
# Mean week-over-week change in top speed per position group
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_pct_max_velocity_change, color = Position_Group)) +
  # FIX: `size` is deprecated for lines since ggplot2 3.4 -- use `linewidth`
  geom_line(linewidth = 1.2) +
  labs(
    title = "Mean % Change in Max Velocity by Position Group",
    x = "Week",
    y = "Mean % Change in Max Velocity"
  ) +
  theme_classic()
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_line()`).
# Overlay plot: band-effort % lines plus the group mean weekly max velocity.
# NOTE(review): the black line plots raw mean_weekly_max_velocity values on
# the same axis as the percentages, with no rescaling or secondary axis —
# confirm this overlay is intended, since the two series are in different
# units.
ggplot() +
# Plot the stacked lines for band effort %
geom_line(data = band_long, aes(x = week, y = mean_pct_effort, color = velocity_band), size = 1) +
# Black trend line: group mean weekly max velocity (raw units)
geom_line(data = weekly_band_effort_by_group,
aes(x = week, y = mean_weekly_max_velocity),
color = "black", size = 1.2) +
facet_wrap(~ Position_Group) +
labs(
title = "Weekly Mean % Effort in Velocity Bands by Position Group\nwith Mean Weekly Max Velocity",
x = "Week",
y = "Mean % Effort",
color = "Velocity Band"
) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Modeling
# Separate by position group
# Split the weekly summary into one data frame per position group so each
# group can be modeled separately.
bigs  <- filter(weekly_band_effort_by_group, Position_Group == "BIGS")
skill <- filter(weekly_band_effort_by_group, Position_Group == "SKILL")
combo <- filter(weekly_band_effort_by_group, Position_Group == "COMBO")
# Bigs Model
# Regress weekly mean max velocity on the lagged mean distance in each
# absolute velocity band (bigs only).
model_bigs <- lm(
  mean_weekly_max_velocity ~
    mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 +
    mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
  data = bigs
)
summary(model_bigs)
Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 +
mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
data = bigs)
Residuals:
Min 1Q Median 3Q Max
-1.82215 -0.36391 -0.01408 0.48755 2.67702
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.0463239 0.4738307 31.755 <2e-16 ***
mean_lag_V2 0.0009756 0.0162578 0.060 0.953
mean_lag_V3 -0.0181296 0.0607335 -0.299 0.768
mean_lag_V4 0.0132060 0.0701266 0.188 0.852
mean_lag_V5 0.0981497 0.0987247 0.994 0.329
mean_lag_V6 -0.2698887 0.2211272 -1.221 0.233
mean_lag_V7 0.5314275 0.5568642 0.954 0.348
mean_lag_V8 0.6952574 0.5708037 1.218 0.234
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9484 on 27 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.4165, Adjusted R-squared: 0.2652
F-statistic: 2.753 on 7 and 27 DF, p-value: 0.02704
# Skill Model
# Same lagged-band regression, fit on the skill position group.
model_skill <- lm(
  mean_weekly_max_velocity ~
    mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 +
    mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
  data = skill
)
summary(model_skill)
Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 +
mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
data = skill)
Residuals:
Min 1Q Median 3Q Max
-4.3553 -0.4427 0.0654 0.6939 1.8460
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 17.094703 0.792893 21.560 <2e-16 ***
mean_lag_V2 -0.019243 0.019233 -1.001 0.326
mean_lag_V3 0.008077 0.049145 0.164 0.871
mean_lag_V4 0.017559 0.061649 0.285 0.778
mean_lag_V5 -0.011812 0.101622 -0.116 0.908
mean_lag_V6 0.164215 0.134077 1.225 0.231
mean_lag_V7 -0.180434 0.173843 -1.038 0.308
mean_lag_V8 0.212522 0.212439 1.000 0.326
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.269 on 28 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2548, Adjusted R-squared: 0.06852
F-statistic: 1.368 on 7 and 28 DF, p-value: 0.2574
# Combo Model
# Same lagged-band regression, fit on the combo position group.
model_combo <- lm(
  mean_weekly_max_velocity ~
    mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 +
    mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
  data = combo
)
summary(model_combo)
Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 +
mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8,
data = combo)
Residuals:
Min 1Q Median 3Q Max
-1.8950 -0.3423 0.1298 0.4643 1.3618
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 17.85151 0.42494 42.009 <2e-16 ***
mean_lag_V2 0.01297 0.01314 0.987 0.332
mean_lag_V3 -0.04224 0.03598 -1.174 0.251
mean_lag_V4 0.01894 0.04929 0.384 0.704
mean_lag_V5 0.06684 0.08694 0.769 0.449
mean_lag_V6 -0.12756 0.16048 -0.795 0.434
mean_lag_V7 0.17000 0.27613 0.616 0.543
mean_lag_V8 -0.04655 0.26492 -0.176 0.862
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8189 on 27 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2282, Adjusted R-squared: 0.02812
F-statistic: 1.141 on 7 and 27 DF, p-value: 0.3681
# Group correlation:
# Select relevant variables
# Response plus the seven lagged band means, built programmatically so the
# list cannot drift out of sync with the band numbering.
vars <- c("mean_weekly_max_velocity", paste0("mean_lag_V", 2:8))
# Pairwise correlations within each position group (complete cases only).
cor_bigs  <- cor(bigs[, vars],  use = "complete.obs")
cor_skill <- cor(skill[, vars], use = "complete.obs")
cor_combo <- cor(combo[, vars], use = "complete.obs")
round(cor_bigs, 2)
mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity 1.00 -0.43 -0.38 -0.21
mean_lag_V2 -0.43 1.00 0.98 0.86
mean_lag_V3 -0.38 0.98 1.00 0.93
mean_lag_V4 -0.21 0.86 0.93 1.00
mean_lag_V5 0.16 0.28 0.36 0.60
mean_lag_V6 0.20 0.06 0.11 0.28
mean_lag_V7 0.35 -0.09 -0.04 0.12
mean_lag_V8 0.35 0.07 0.11 0.18
mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity 0.16 0.20 0.35 0.35
mean_lag_V2 0.28 0.06 -0.09 0.07
mean_lag_V3 0.36 0.11 -0.04 0.11
mean_lag_V4 0.60 0.28 0.12 0.18
mean_lag_V5 1.00 0.83 0.61 0.27
mean_lag_V6 0.83 1.00 0.88 0.40
mean_lag_V7 0.61 0.88 1.00 0.62
mean_lag_V8 0.27 0.40 0.62 1.00
round(cor_skill, 2)
mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity 1.00 0.16 0.17 0.18
mean_lag_V2 0.16 1.00 1.00 0.99
mean_lag_V3 0.17 1.00 1.00 1.00
mean_lag_V4 0.18 0.99 1.00 1.00
mean_lag_V5 0.22 0.98 0.99 0.99
mean_lag_V6 0.30 0.94 0.95 0.95
mean_lag_V7 0.31 0.61 0.63 0.63
mean_lag_V8 0.34 0.59 0.59 0.59
mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity 0.22 0.30 0.31 0.34
mean_lag_V2 0.98 0.94 0.61 0.59
mean_lag_V3 0.99 0.95 0.63 0.59
mean_lag_V4 0.99 0.95 0.63 0.59
mean_lag_V5 1.00 0.97 0.64 0.61
mean_lag_V6 0.97 1.00 0.78 0.75
mean_lag_V7 0.64 0.78 1.00 0.94
mean_lag_V8 0.61 0.75 0.94 1.00
round(cor_combo, 2)
mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity 1.00 -0.34 -0.33 -0.28
mean_lag_V2 -0.34 1.00 0.99 0.97
mean_lag_V3 -0.33 0.99 1.00 0.99
mean_lag_V4 -0.28 0.97 0.99 1.00
mean_lag_V5 -0.16 0.81 0.85 0.92
mean_lag_V6 -0.09 0.56 0.59 0.64
mean_lag_V7 0.04 0.17 0.19 0.23
mean_lag_V8 0.05 0.19 0.22 0.25
mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity -0.16 -0.09 0.04 0.05
mean_lag_V2 0.81 0.56 0.17 0.19
mean_lag_V3 0.85 0.59 0.19 0.22
mean_lag_V4 0.92 0.64 0.23 0.25
mean_lag_V5 1.00 0.83 0.46 0.43
mean_lag_V6 0.83 1.00 0.84 0.71
mean_lag_V7 0.46 0.84 1.00 0.89
mean_lag_V8 0.43 0.71 0.89 1.00
# Reduce for better modeling
# Collapse the seven correlated band predictors into three bins (bigs split:
# V2-V3 low, V4-V6 mid, V7-V8 high) to reduce multicollinearity.
bigs <- bigs %>%
  mutate(
    low_band  = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band  = (mean_lag_V4 + mean_lag_V5 + mean_lag_V6) / 3,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
# Refit on the three binned predictors.
model_bigs_simple <- lm(
  mean_weekly_max_velocity ~ low_band + mid_band + high_band,
  data = bigs
)
summary(model_bigs_simple)
Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band +
high_band, data = bigs)
Residuals:
Min 1Q Median 3Q Max
-1.6146 -0.4911 -0.0142 0.3108 2.8070
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.125433 0.468820 32.263 <2e-16 ***
low_band -0.007572 0.003139 -2.412 0.022 *
mid_band 0.035703 0.034036 1.049 0.302
high_band 0.456591 0.319427 1.429 0.163
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9489 on 31 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.3294, Adjusted R-squared: 0.2645
F-statistic: 5.076 on 3 and 31 DF, p-value: 0.005645
# Skill Group
# Skill split uses a wider low bin (V2-V4) than bigs, reflecting that skill
# players spend less relative time in the lowest bands.
skill <- skill %>%
  mutate(
    low_band  = (mean_lag_V2 + mean_lag_V3 + mean_lag_V4) / 3,
    mid_band  = (mean_lag_V5 + mean_lag_V6) / 2,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
# Refit on the three binned predictors.
model_skill_simple <- lm(
  mean_weekly_max_velocity ~ low_band + mid_band + high_band,
  data = skill
)
summary(model_skill_simple)
Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band +
high_band, data = skill)
Residuals:
Min 1Q Median 3Q Max
-4.5352 -0.4681 0.0355 0.6976 1.9090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.94373 0.71506 23.695 <2e-16 ***
low_band -0.02104 0.01035 -2.033 0.0504 .
mid_band 0.13564 0.06723 2.018 0.0521 .
high_band 0.03801 0.08073 0.471 0.6410
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.221 on 32 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.2121, Adjusted R-squared: 0.1382
F-statistic: 2.871 on 3 and 32 DF, p-value: 0.05162
# Combo group
# Combo split: V2-V3 low, V4-V5 mid, V6-V8 high.
combo <- combo %>%
  mutate(
    low_band  = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band  = (mean_lag_V4 + mean_lag_V5) / 2,
    high_band = (mean_lag_V6 + mean_lag_V7 + mean_lag_V8) / 3
  )
# Refit on the three binned predictors.
model_combo_simple <- lm(
  mean_weekly_max_velocity ~ low_band + mid_band + high_band,
  data = combo
)
summary(model_combo_simple)
Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band +
high_band, data = combo)
Residuals:
Min 1Q Median 3Q Max
-1.86355 -0.34167 0.06995 0.46065 1.47691
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.794e+01 3.691e-01 48.606 <2e-16 ***
low_band -7.850e-03 4.452e-03 -1.763 0.0877 .
mid_band 2.258e-02 1.956e-02 1.154 0.2573
high_band 2.952e-05 5.154e-02 0.001 0.9995
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7945 on 31 degrees of freedom
(1 observation deleted due to missingness)
Multiple R-squared: 0.1658, Adjusted R-squared: 0.0851
F-statistic: 2.054 on 3 and 31 DF, p-value: 0.1267
Bigs model: Statistically significant overall and explains around 33% of the variation in max velocity. The low band is significantly negative. Every unit increase in the low band results in a decrease in max velocity. The mid and high bands are not significant but the high band shows a positive trend. Based on this model, for bigs, more low-band effort may reduce weekly top speed, possibly indicating underexposure of high speeds. High-band efforts might help, but aren’t clearly impactful in this group.
Skill model: Almost significant. Low band has a negative effect, mid band has a positive effect, and high band is not significant. Based on this model, for skill players, more low-intensity exposure seems detrimental to max speed, while moderate band (V5–V6) exposure may enhance it.
Combo model: Model is not significant. The combo position group contains many different types of positions (QBs, RBs, TEs, LBs) with different movement patterns and velocities. This variability may obscure relationships.
Across groups, low-band exposure is consistently negatively related to peak weekly speed — suggesting that too much low-intensity work may reduce the capacity to reach high speeds.
Look at ID_11’s plot from the last question. This player is an offensive lineman. Looking at the plot, he is mostly in band 2 and 3 while never reaching bands 7 or 8 and rarely reaching band 6. Despite this, this athlete’s weekly max velocity is always at least 75% of their all-time max velocity. Only looking at the absolute bands that are provided, we might come to the conclusion that ID_11 is not achieving high running speeds, however after looking at his max velocity efforts relative to his all-time max velocity, he is reaching high running speeds.
# Create Relative Bands for ID_11
# Prepare ID_11 dataset with relative bands
# Build ID_11's session-level data with each session's max velocity expressed
# as a % of the player's all-time max, bucketed into relative bands.
# NOTE(review): Player.Max.Velocity is not in the select() list shown at the
# top of this notebook — presumably it is added to Catapult_Session_clean in
# an earlier chunk; verify before rerunning in isolation.
ID_11 <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
filter(anon_id == "ID_11", Maximum.Velocity != 0) %>%
filter(!is.na(Maximum.Velocity)) %>%
mutate(
week = floor_date(Date, unit = "week"),
pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
# Create Relative Bands: 10%-wide buckets of % of all-time max (first bucket
# spans 0-40%)
relative_band = cut(
pct_of_max,
breaks = c(0, 40, 50, 60, 70, 80, 90, 100),
labels = c("V2 (<40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
right = TRUE, # Set to TRUE to include 100% in band
include.lowest = TRUE # Includes 0 in first band
),
# Reverse factor levels so highest band is on top in plot
relative_band = factor(
relative_band,
levels = rev(c("V2 (<40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
)
)
# Summarize counts per week and relative band
# Per-week effort counts by relative band, converted to each band's share of
# that week's total efforts.
weekly_effort <- ID_11 %>%
  count(week, relative_band, name = "count") %>%
  group_by(week) %>%
  mutate(pct_effort = (count / sum(count)) * 100) %>%
  ungroup()
# Spread bands into columns; bands absent in a given week become 0 rather
# than NA.
weekly_effort_wide <- weekly_effort %>%
  select(week, relative_band, pct_effort) %>%
  pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)
# Weekly peak expressed as % of the player's all-time max velocity.
weekly_max_velocity <- ID_11 %>%
  group_by(week) %>%
  summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")
# Attach the weekly peak to the wide effort table.
weekly_effort_wide <- left_join(weekly_effort_wide, weekly_max_velocity, by = "week")
# Back to long format for stacked-bar plotting; a single label vector keeps
# the column selection and the factor levels in sync.
id11_band_labels <- c("V2 (<40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                      "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)")
weekly_effort_long <- weekly_effort_wide %>%
  pivot_longer(
    cols = all_of(id11_band_labels),
    names_to = "relative_band",
    values_to = "pct_effort"
  ) %>%
  # Reversed levels put the highest band on top of the stack
  mutate(relative_band = factor(relative_band, levels = rev(id11_band_labels)))
# Plot Relative Bands for ID_11
# Stacked bars: weekly share of efforts in each relative band; black line +
# points: weekly max as % of all-time max (both series are percentages, so
# they share one scale; the secondary axis is a duplicate for labeling).
ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
geom_col(position = "stack") +
geom_line(aes(y = pct_of_max_velocity, group = 1),
color = "black", size = 1.2) +
geom_point(aes(y = pct_of_max_velocity),
color = "black", size = 2) +
# Green-to-red palette: low bands cool, high bands hot
scale_fill_manual(
values = c(
"V2 (<40%)" = "darkgreen",
"V3 (40-50%)" = "green2",
"V4 (50-60%)" = "greenyellow",
"V5 (60-70%)" = "yellow",
"V6 (70-80%)" = "orange",
"V7 (80-90%)" = "tomato",
"V8 (90-100%)" = "darkred"
)
) +
labs(
title = "Relative Velocity Band Effort % per Week, ID_11",
x = "Week",
y = "Percent of Weekly Efforts",
fill = "Relative Velocity Band"
) +
scale_y_continuous(
sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
) +
theme_classic()
Comparing this with the absolute bands graph of ID_11, we can see that this is a much better representation of their running speeds.
# Function to create relative bands and plot these for each athlete
# Arrange data so that the plot will display in anon_id order
# Sort athlete IDs so the plots print in anon_id order.
clean_anons <- Catapult_Session_clean %>%
  arrange(anon_id)
# Loop over all athletes and print each one's relative-band plot.
# invisible() suppresses the list of NULLs that a bare top-level lapply()
# would print after the plots (one "[[i]] NULL" entry per athlete).
invisible(lapply(unique(clean_anons$anon_id), function(player_id) {
  # Sessions in the season window with a usable max-velocity reading
  player_data <- Catapult_Session_clean %>%
    filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
    filter(anon_id == player_id, Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
    mutate(
      week = floor_date(Date, unit = "week"),
      pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
      # Relative bands: buckets of % of the athlete's all-time max
      relative_band = cut(
        pct_of_max,
        breaks = c(0, 40, 50, 60, 70, 80, 90, 100),
        labels = c("V2 (<40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                   "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
        right = TRUE,          # include 100% in the top band
        include.lowest = TRUE  # include 0 in the first band
      ),
      # Reverse levels so the highest band stacks on top in the plot
      relative_band = factor(
        relative_band,
        levels = rev(c("V2 (<40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                       "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
      )
    )
  if (nrow(player_data) == 0) return(NULL) # skip athletes with no valid rows
  # Effort counts per week and band, as a share of the week's total
  weekly_effort <- player_data %>%
    group_by(week, relative_band) %>%
    summarise(count = n(), .groups = "drop") %>%
    group_by(week) %>%
    mutate(pct_effort = (count / sum(count)) * 100) %>%
    ungroup()
  # One column per band; absent bands become 0
  weekly_effort_wide <- weekly_effort %>%
    select(week, relative_band, pct_effort) %>%
    pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)
  # Weekly peak as % of the athlete's all-time max
  weekly_max_velocity <- player_data %>%
    group_by(week) %>%
    summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")
  weekly_effort_wide <- weekly_effort_wide %>%
    left_join(weekly_max_velocity, by = "week")
  # Long format for stacked bars
  weekly_effort_long <- weekly_effort_wide %>%
    pivot_longer(
      cols = -c(week, pct_of_max_velocity),
      names_to = "relative_band",
      values_to = "pct_effort"
    ) %>%
    mutate(
      relative_band = factor(
        relative_band,
        levels = rev(c("V2 (<40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                       "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
      )
    )
  # Stacked band shares plus the weekly-peak trend line. Athletes with only
  # one plotted week still trigger ggplot's "one observation" notice for the
  # line layer; the bars render regardless.
  plot <- ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
    geom_col(position = "stack") +
    geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
    geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
    scale_fill_manual(
      values = c(
        "V2 (<40%)" = "darkgreen",
        "V3 (40-50%)" = "green2",
        "V4 (50-60%)" = "greenyellow",
        "V5 (60-70%)" = "yellow",
        "V6 (70-80%)" = "orange",
        "V7 (80-90%)" = "tomato",
        "V8 (90-100%)" = "darkred"
      )
    ) +
    scale_y_continuous(
      name = "Percent of Weekly Effort",
      sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
    ) +
    labs(
      title = paste("Relative Velocity Band Effort % and Max Velocity Trend for Player", player_id),
      x = "Week",
      fill = "Relative Velocity Band"
    ) +
    theme_classic()
  print(plot)
  NULL
}))
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?
`geom_line()`: Each group consists of only one observation.
ℹ Do you need to adjust the group aesthetic?
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
[[4]]
NULL
[[5]]
NULL
[[6]]
NULL
[[7]]
NULL
[[8]]
NULL
[[9]]
NULL
[[10]]
NULL
[[11]]
NULL
[[12]]
NULL
[[13]]
NULL
[[14]]
NULL
[[15]]
NULL
[[16]]
NULL
[[17]]
NULL
[[18]]
NULL
[[19]]
NULL
[[20]]
NULL
[[21]]
NULL
[[22]]
NULL
[[23]]
NULL
[[24]]
NULL
[[25]]
NULL
[[26]]
NULL
[[27]]
NULL
[[28]]
NULL
[[29]]
NULL
[[30]]
NULL
[[31]]
NULL
[[32]]
NULL
[[33]]
NULL
[[34]]
NULL
[[35]]
NULL
[[36]]
NULL
[[37]]
NULL
[[38]]
NULL
[[39]]
NULL
[[40]]
NULL
[[41]]
NULL
[[42]]
NULL
[[43]]
NULL
[[44]]
NULL
[[45]]
NULL
[[46]]
NULL
[[47]]
NULL
[[48]]
NULL
[[49]]
NULL
[[50]]
NULL
[[51]]
NULL
[[52]]
NULL
[[53]]
NULL
[[54]]
NULL
[[55]]
NULL
[[56]]
NULL
[[57]]
NULL
[[58]]
NULL
[[59]]
NULL
[[60]]
NULL
[[61]]
NULL
[[62]]
NULL
[[63]]
NULL
[[64]]
NULL
[[65]]
NULL
[[66]]
NULL
[[67]]
NULL
[[68]]
NULL
[[69]]
NULL
[[70]]
NULL
[[71]]
NULL
[[72]]
NULL
[[73]]
NULL
[[74]]
NULL
[[75]]
NULL
[[76]]
NULL
[[77]]
NULL
[[78]]
NULL
[[79]]
NULL
[[80]]
NULL
[[81]]
NULL
[[82]]
NULL
[[83]]
NULL
[[84]]
NULL
[[85]]
NULL
[[86]]
NULL
[[87]]
NULL
[[88]]
NULL
[[89]]
NULL
[[90]]
NULL
[[91]]
NULL
[[92]]
NULL
[[93]]
NULL
[[94]]
NULL
[[95]]
NULL
[[96]]
NULL
[[97]]
NULL
[[98]]
NULL
[[99]]
NULL
[[100]]
NULL
[[101]]
NULL
[[102]]
NULL
[[103]]
NULL
[[104]]
NULL
# Create relative bands for all athletes
# Define the velocity band labels
band_levels <- c("V2 (<40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
"V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)")
# Create exposure data frame for all players: one row per athlete-week, with
# each relative band's share of that week's efforts spread into columns.
weekly_sprint_exposure <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
mutate(
week = floor_date(Date, unit = "week"),
pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
relative_band = cut(
pct_of_max,
# NOTE(review): upper break is 100.1 here, vs 100 in the per-player version
# above — presumably to catch values fractionally above 100; confirm the
# two chunks should not use the same breaks.
breaks = c(0, 40, 50, 60, 70, 80, 90, 100.1),
labels = band_levels,
right = TRUE, # will include 100
include.lowest = TRUE
),
relative_band = factor(relative_band, levels = band_levels)
) %>%
# Count efforts per athlete-week-band, then convert to within-week shares
group_by(anon_id, week, relative_band) %>%
summarise(effort_count = n(), .groups = "drop") %>%
group_by(anon_id, week) %>%
mutate(
total_efforts = sum(effort_count),
pct_effort = (effort_count / total_efforts) * 100
) %>%
ungroup() %>%
select(anon_id, week, relative_band, pct_effort) %>%
pivot_wider(
names_from = relative_band,
values_from = pct_effort,
values_fill = 0
)
# Add weekly max velocity % of max (weekly peak per athlete, relative to
# their all-time max)
weekly_max_pct <- Catapult_Session_clean %>%
filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
mutate(
week = floor_date(Date, unit = "week"),
pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100
) %>%
group_by(anon_id, week) %>%
summarise(pct_of_max_velocity = max(pct_of_max, na.rm = TRUE), .groups = "drop")
# Final data set: band shares plus the weekly peak per athlete-week
relative_bands <- weekly_sprint_exposure %>%
left_join(weekly_max_pct, by = c("anon_id", "week"))
# Convert to long for plotting (band columns all start with "V")
relative_bands_long <- relative_bands %>%
pivot_longer(
cols = starts_with("V"),
names_to = "relative_band",
values_to = "pct_effort"
) %>%
mutate(
relative_band = factor(relative_band, levels = band_levels)
)
# Map raw positions into the three modeling groups.
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
# NOTE(review): Positions is never referenced in this chunk — confirm it is
# used elsewhere or remove it.
Positions <- c("COMBO", "BIG", "SKILL")
# Attach each athlete's position. NOTE(review): player_positions is defined
# in an earlier chunk not shown here; relationship = "many-to-many" permits
# row fan-out, which the distinct() below only partially undoes (it removes
# exact duplicate rows only).
weekly_sprint_exposure <- left_join(weekly_sprint_exposure,player_positions, by = "anon_id", relationship = "many-to-many")
# Positions not in any of the three vectors fall through to NA Position
weekly_sprint_exposure <- weekly_sprint_exposure %>%
mutate(Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
Primary.Position %in% BIG ~ "BIG",
Primary.Position %in% SKILL ~ "SKILL"))
weekly_sprint_exposure <- distinct(weekly_sprint_exposure)
# Attach the weekly peak (% of max). NOTE(review): weekly_velocity_efforts is
# also defined in an earlier chunk — verify it matches weekly_max_pct above.
max_velocities <- weekly_velocity_efforts[,c("anon_id","week","pct_of_max_velocity")]
weekly_sprint_exposure <- left_join(weekly_sprint_exposure, max_velocities, by = c("anon_id", "week"), relationship="many-to-many")
# Helper: attach weekly position-group means of the % of max velocity and of
# each relative-band effort share, then drop rows with any NA. Uses mutate
# (not summarise) so athlete-level rows are kept, with the week-level mean
# repeated on every row — exactly as the original three pipelines did.
# Column creation order (avg_pct_max, avg_V8 .. avg_V2) is preserved because
# later code indexes these columns by position.
add_weekly_group_avgs <- function(df) {
  df %>%
    group_by(week) %>%
    mutate(
      avg_pct_max = mean(pct_of_max_velocity),
      avg_V8 = mean(`V8 (90-100%)`),
      avg_V7 = mean(`V7 (80-90%)`),
      avg_V6 = mean(`V6 (70-80%)`),
      avg_V5 = mean(`V5 (60-70%)`),
      avg_V4 = mean(`V4 (50-60%)`),
      avg_V3 = mean(`V3 (40-50%)`),
      avg_V2 = mean(`V2 (<40%)`)
    ) %>%
    ungroup() %>%
    na.omit()
}
# One data frame per position group, all built by the same helper so the
# three pipelines cannot drift apart.
sprint_exposure_big <- weekly_sprint_exposure %>%
  filter(Position == "BIG") %>%
  add_weekly_group_avgs()
sprint_exposure_combo <- weekly_sprint_exposure %>%
  filter(Position == "COMBO") %>%
  add_weekly_group_avgs()
sprint_exposure_skill <- weekly_sprint_exposure %>%
  filter(Position == "SKILL") %>%
  add_weekly_group_avgs()
When I built the models that considered the relative bands instead of the absolute bands provided, I found that initially, the results were a lot better than the previous models with the absolute bands. This suggests that the relative bands, which are segmented into 10% chunks of each player's all-time maximum velocity, are a lot more indicative of effort and therefore of their percentage of maximum velocity in a given week. We are able to see, though, that there is a lot of multicollinearity within the relative bands calculated.
# Modeling
# Bigs Model
# Relative-band model (bigs): weekly group-average % of max velocity on the
# average effort share in each relative band.
model_big <- lm(
  avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + avg_V6 + avg_V7 + avg_V8,
  data = sprint_exposure_big
)
summary(model_big)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 +
avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_big)
Residuals:
Min 1Q Median 3Q Max
-8.5126 -1.9609 -0.5462 2.4394 8.2064
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 102.76569 2.84956 36.064 < 2e-16 ***
avg_V2 -0.19388 0.03814 -5.084 4.80e-07 ***
avg_V3 0.02051 0.03135 0.654 0.51327
avg_V4 -0.69959 0.07031 -9.951 < 2e-16 ***
avg_V5 -0.07283 0.02779 -2.621 0.00897 **
avg_V6 -0.20796 0.03023 -6.878 1.39e-11 ***
avg_V7 -0.23638 0.05199 -4.547 6.45e-06 ***
avg_V8 NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.616 on 672 degrees of freedom
Multiple R-squared: 0.4877, Adjusted R-squared: 0.4831
F-statistic: 106.6 on 6 and 672 DF, p-value: < 2.2e-16
# Skill Model
# Relative-band model, fit on the skill group.
model_skill <- lm(
  avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + avg_V6 + avg_V7 + avg_V8,
  data = sprint_exposure_skill
)
summary(model_skill)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 +
avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_skill)
Residuals:
Min 1Q Median 3Q Max
-10.2750 -1.8666 0.6126 2.0181 5.9930
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 99.45646 1.01082 98.392 < 2e-16 ***
avg_V2 -0.13075 0.01824 -7.169 2.14e-12 ***
avg_V3 -0.24862 0.04422 -5.622 2.84e-08 ***
avg_V4 -0.03942 0.03601 -1.095 0.274
avg_V5 -0.02258 0.02978 -0.758 0.449
avg_V6 -0.15986 0.02145 -7.454 3.04e-13 ***
avg_V7 -0.16516 0.01571 -10.513 < 2e-16 ***
avg_V8 NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.218 on 625 degrees of freedom
Multiple R-squared: 0.2212, Adjusted R-squared: 0.2137
F-statistic: 29.59 on 6 and 625 DF, p-value: < 2.2e-16
# Combo Model
# Relative-band model, fit on the combo group.
model_combo <- lm(
  avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + avg_V6 + avg_V7 + avg_V8,
  data = sprint_exposure_combo
)
summary(model_combo)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 +
avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_combo)
Residuals:
Min 1Q Median 3Q Max
-5.0603 -2.4193 0.2974 1.9203 5.5500
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 106.10858 1.87724 56.524 < 2e-16 ***
avg_V2 -0.37450 0.02241 -16.709 < 2e-16 ***
avg_V3 -0.11987 0.05064 -2.367 0.0183 *
avg_V4 -0.21829 0.03452 -6.324 5.92e-10 ***
avg_V5 0.05418 0.03485 1.554 0.1207
avg_V6 -0.43729 0.02543 -17.194 < 2e-16 ***
avg_V7 -0.21196 0.03118 -6.798 3.21e-11 ***
avg_V8 NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.911 on 473 degrees of freedom
Multiple R-squared: 0.5088, Adjusted R-squared: 0.5025
F-statistic: 81.65 on 6 and 473 DF, p-value: < 2.2e-16
Looking at the correlations between all of the predictors as well as the response variable in the model we can see that a lot of the predictors have super strong correlations which each other, some more so than they are with the response. This suggests that the issue with the models above is that multicollinearity is bogging them down and we are not seeing the true relationships between the predictors and the response.
cor(sprint_exposure_big[,13:20])
avg_pct_max avg_V8 avg_V7 avg_V6 avg_V5 avg_V4
avg_pct_max 1.0000000 0.4582520 0.6219636 0.3378674 -0.249229679 -0.6661297
avg_V8 0.4582520 1.0000000 0.6811350 0.1720535 -0.421487722 -0.5116290
avg_V7 0.6219636 0.6811350 1.0000000 0.5881781 -0.278503389 -0.9137944
avg_V6 0.3378674 0.1720535 0.5881781 1.0000000 0.187770159 -0.6677577
avg_V5 -0.2492297 -0.4214877 -0.2785034 0.1877702 1.000000000 0.2554871
avg_V4 -0.6661297 -0.5116290 -0.9137944 -0.6677577 0.255487107 1.0000000
avg_V3 -0.5089165 -0.7704102 -0.8339267 -0.6625206 0.006712757 0.7591349
avg_V2 -0.3944046 -0.6634281 -0.7750296 -0.6852893 -0.117623629 0.5880220
avg_V3 avg_V2
avg_pct_max -0.508916494 -0.3944046
avg_V8 -0.770410157 -0.6634281
avg_V7 -0.833926705 -0.7750296
avg_V6 -0.662520576 -0.6852893
avg_V5 0.006712757 -0.1176236
avg_V4 0.759134890 0.5880220
avg_V3 1.000000000 0.8431295
avg_V2 0.843129453 1.0000000
cor(sprint_exposure_skill[,13:20])
avg_pct_max avg_V8 avg_V7 avg_V6 avg_V5 avg_V4
avg_pct_max 1.00000000 0.3511269 -0.1404658 -0.01318182 0.07138302 -0.1093645
avg_V8 0.35112691 1.0000000 0.3053989 -0.16642717 -0.47290915 -0.6817410
avg_V7 -0.14046584 0.3053989 1.0000000 -0.11434145 -0.68601047 -0.6210217
avg_V6 -0.01318182 -0.1664272 -0.1143415 1.00000000 0.44178456 -0.1941185
avg_V5 0.07138302 -0.4729092 -0.6860105 0.44178456 1.00000000 0.4425883
avg_V4 -0.10936454 -0.6817410 -0.6210217 -0.19411846 0.44258834 1.0000000
avg_V3 -0.15731269 -0.6140748 -0.6027677 -0.30151118 0.35853718 0.7976768
avg_V2 -0.12376236 -0.5173474 -0.5929917 -0.40428088 0.13488291 0.5984834
avg_V3 avg_V2
avg_pct_max -0.1573127 -0.1237624
avg_V8 -0.6140748 -0.5173474
avg_V7 -0.6027677 -0.5929917
avg_V6 -0.3015112 -0.4042809
avg_V5 0.3585372 0.1348829
avg_V4 0.7976768 0.5984834
avg_V3 1.0000000 0.6423597
avg_V2 0.6423597 1.0000000
cor(sprint_exposure_combo[,13:20])
avg_pct_max avg_V8 avg_V7 avg_V6 avg_V5 avg_V4
avg_pct_max 1.00000000 0.54355422 0.2862058 -0.09870834 -0.06288482 -0.2881056
avg_V8 0.54355422 1.00000000 0.7214029 0.07718591 -0.58822845 -0.7411365
avg_V7 0.28620581 0.72140294 1.0000000 0.31120217 -0.60306684 -0.8384875
avg_V6 -0.09870834 0.07718591 0.3112022 1.00000000 0.19086044 -0.4571797
avg_V5 -0.06288482 -0.58822845 -0.6030668 0.19086044 1.00000000 0.3904188
avg_V4 -0.28810555 -0.74113653 -0.8384875 -0.45717969 0.39041882 1.0000000
avg_V3 -0.21041589 -0.68019434 -0.8727456 -0.50967887 0.41230683 0.8353128
avg_V2 -0.41135575 -0.66271739 -0.7462830 -0.64926319 0.16056174 0.6904312
avg_V3 avg_V2
avg_pct_max -0.2104159 -0.4113558
avg_V8 -0.6801943 -0.6627174
avg_V7 -0.8727456 -0.7462830
avg_V6 -0.5096789 -0.6492632
avg_V5 0.4123068 0.1605617
avg_V4 0.8353128 0.6904312
avg_V3 1.0000000 0.7217605
avg_V2 0.7217605 1.0000000
All of the models below are the best 3 predictor models that resulted for each position after running the best subsets algorithm. All of the following have reductions in adjusted-\(R^2\) but they don’t seem significant considering that we took out over half of the predictors and maintained a relatively similar adjusted-\(R^2\) value.
#Bigs
#best_sub_big <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
# data=sprint_exposure_big, method="exhaustive")
#summary(best_sub_big)
# Best three-predictor model for bigs, per the (commented-out) regsubsets run.
model_big <- lm(avg_pct_max ~ avg_V2 + avg_V6 + avg_V7, data = sprint_exposure_big)
summary(model_big)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V6 + avg_V7, data = sprint_exposure_big)
Residuals:
Min 1Q Median 3Q Max
-7.9801 -2.6347 -0.0736 2.3776 8.9249
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 74.80853 0.70277 106.448 < 2e-16 ***
avg_V2 0.09301 0.02000 4.650 3.99e-06 ***
avg_V6 0.02148 0.02054 1.045 0.296
avg_V7 0.26614 0.01601 16.627 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.882 on 675 degrees of freedom
Multiple R-squared: 0.407, Adjusted R-squared: 0.4044
F-statistic: 154.4 on 3 and 675 DF, p-value: < 2.2e-16
#Skills
#best_sub_skill <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
# data=sprint_exposure_skill, method="exhaustive")
#summary(best_sub_skill)
# Best three-predictor model for skill, per the (commented-out) regsubsets run.
model_skill <- lm(avg_pct_max ~ avg_V2 + avg_V5 + avg_V6, data = sprint_exposure_skill)
summary(model_skill)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V5 + avg_V6, data = sprint_exposure_skill)
Residuals:
Min 1Q Median 3Q Max
-11.3474 -1.8114 0.4317 2.0136 8.1701
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 89.27952 0.50882 175.464 < 2e-16 ***
avg_V2 -0.06866 0.01423 -4.824 1.77e-06 ***
avg_V5 0.08903 0.02294 3.880 0.000115 ***
avg_V6 -0.07934 0.02200 -3.607 0.000335 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.558 on 628 degrees of freedom
Multiple R-squared: 0.04304, Adjusted R-squared: 0.03847
F-statistic: 9.415 on 3 and 628 DF, p-value: 4.303e-06
# Combos: best 3-predictor model chosen by the exhaustive best-subsets search
# best_sub_combo <- regsubsets(avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + avg_V6 + avg_V7 + avg_V8,
#                              data = sprint_exposure_combo, method = "exhaustive")
# summary(best_sub_combo)
model_combo <- lm(avg_pct_max ~ avg_V2 + avg_V4 + avg_V5, data = sprint_exposure_combo)
summary(model_combo)
Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V4 + avg_V5, data = sprint_exposure_combo)
Residuals:
Min 1Q Median 3Q Max
-8.9006 -2.3452 0.4021 2.7975 6.9917
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 87.075121 0.519840 167.504 < 2e-16 ***
avg_V2 -0.129116 0.018681 -6.912 1.54e-11 ***
avg_V4 -0.005862 0.032598 -0.180 0.857
avg_V5 0.004511 0.031992 0.141 0.888
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.773 on 476 degrees of freedom
Multiple R-squared: 0.1693, Adjusted R-squared: 0.164
F-statistic: 32.33 on 3 and 476 DF, p-value: < 2.2e-16
Since the models perform roughly the same with just 3 predictors as they do with all 7, it may make sense to instead truncate into more general bins. This may help us understand the relationship with low, medium and high effort with weekly maximum velocity.
# Collapse the seven velocity-band averages into low / medium / high
# intensity bins. The same transformation was previously copy-pasted for
# all three position groups; a single helper removes the duplication.
#
# Args:
#   df: a data frame with columns avg_V2 .. avg_V8.
# Returns: df with avg_low, avg_medium, avg_high columns appended.
add_intensity_bins <- function(df) {
  df$avg_low <- (df$avg_V2 + df$avg_V3) / 2
  df$avg_medium <- (df$avg_V4 + df$avg_V5 + df$avg_V6) / 3
  df$avg_high <- (df$avg_V7 + df$avg_V8) / 2
  df
}
sprint_exposure_big <- add_intensity_bins(sprint_exposure_big)
sprint_exposure_skill <- add_intensity_bins(sprint_exposure_skill)
sprint_exposure_combo <- add_intensity_bins(sprint_exposure_combo)
# Bigs: regress weekly % of max velocity on the truncated intensity bins.
# Note: the three bins are a linear combination of each other's complements,
# so lm() will drop one coefficient as aliased.
model_big_general <- lm(avg_pct_max ~ avg_low + avg_medium + avg_high,
                        data = sprint_exposure_big)
summary(model_big_general)
Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_big)
Residuals:
Min 1Q Median 3Q Max
-9.1915 -1.8353 -0.6574 2.5433 8.7859
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 91.03003 0.61509 147.99 <2e-16 ***
avg_low -0.21955 0.01360 -16.15 <2e-16 ***
avg_medium -0.46033 0.03631 -12.68 <2e-16 ***
avg_high NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4.013 on 676 degrees of freedom
Multiple R-squared: 0.3655, Adjusted R-squared: 0.3637
F-statistic: 194.7 on 2 and 676 DF, p-value: < 2.2e-16
# Skills: same binned-intensity regression as the Bigs model above
model_skill_general <- lm(avg_pct_max ~ avg_low + avg_medium + avg_high,
                          data = sprint_exposure_skill)
summary(model_skill_general)
Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_skill)
Residuals:
Min 1Q Median 3Q Max
-10.6931 -1.6260 0.7439 1.7465 8.5435
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 88.704219 0.469268 189.027 < 2e-16 ***
avg_low -0.068059 0.018507 -3.677 0.000256 ***
avg_medium 0.006289 0.028011 0.225 0.822417
avg_high NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.595 on 629 degrees of freedom
Multiple R-squared: 0.02135, Adjusted R-squared: 0.01824
F-statistic: 6.86 on 2 and 629 DF, p-value: 0.001129
# Combos: same binned-intensity regression as the Bigs model above
model_combo_general <- lm(avg_pct_max ~ avg_low + avg_medium + avg_high,
                          data = sprint_exposure_combo)
summary(model_combo_general)
Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_combo)
Residuals:
Min 1Q Median 3Q Max
-9.0768 -2.1421 0.4358 2.7035 7.2172
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 90.11335 0.67259 133.979 < 2e-16 ***
avg_low -0.14920 0.01902 -7.844 2.88e-14 ***
avg_medium -0.20606 0.04025 -5.119 4.45e-07 ***
avg_high NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.745 on 477 degrees of freedom
Multiple R-squared: 0.1799, Adjusted R-squared: 0.1764
F-statistic: 52.3 on 2 and 477 DF, p-value: < 2.2e-16
Still pretty bad. I am going to use principal components regression to see if there are other relationships we are missing.
# PCA on the seven weekly velocity-band averages.
# Select the predictor columns by name rather than by position — the old
# sprint_exposure_big[, 14:20] silently breaks if columns are reordered.
# (Positions 14:20 corresponded to avg_V8 down to avg_V2, per the loadings
# printout, so the name vector preserves that order.)
preds <- sprint_exposure_big[, paste0("avg_V", 8:2)]
# Loadings of the first three principal components of the scaled predictors
pc_loadings <- prcomp(preds, scale = TRUE)$rotation[, 1:3]
pc_loadings
PC1 PC2 PC3
avg_V8 0.35853037 -0.38392856 -0.5408522
avg_V7 0.44894926 -0.10872539 0.1594170
avg_V6 0.34066538 0.43413628 0.4134098
avg_V5 -0.07919446 0.77438791 -0.3281168
avg_V4 -0.41516958 0.05360132 -0.4830953
avg_V3 -0.44894420 -0.07815761 0.2248914
avg_V2 -0.41672437 -0.20891384 0.3457450
# Join the relative_bands running data with the injury data.
# Keep incidents from the 2024-25 season window only.
Incident_dates <- Incident_Report_clean %>%
  filter(Date >= as.Date("2024-06-30"),
         Date <= as.Date("2025-07-01"))
# Athlete IDs that appear in that incident window
injured_ids <- unique(Incident_dates$anon_id)
# Restrict the running data to those athletes
injured_data <- relative_bands_long %>%
  filter(anon_id %in% injured_ids)
# Attach the weekly injury flag; athlete-weeks with no incident become 0
injuries_and_running <- injured_data %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = ifelse(is.na(injury), 0, injury))
# Demonstration plot for a single athlete: stacked weekly velocity-band
# effort percentages (bars), the weekly % of max velocity trend (line/points),
# and shaded windows around injury weeks.
# For demonstration: pick one player (or loop over them)
player_id <- "ID_306"
plot_data <- injuries_and_running %>%
filter(anon_id == player_id)
# Find injury weeks for this player
injury_week_nums <- plot_data %>% filter(injury == 1) %>% pull(week)
ggplot(plot_data, aes(x = week)) +
# Highlight injury week(s) as shaded rectangles
# (+/- 3 on the week axis; week appears to be a Date here, so this is
# roughly a one-week window — TODO confirm the intended span)
geom_rect(data = data.frame(week = injury_week_nums),
aes(xmin = week - 3, xmax = week + 3, ymin = -Inf, ymax = Inf),
fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
# Stacked band-effort bars, then the max-velocity trend drawn on top
geom_col(aes(y = pct_effort, fill = relative_band), position = "stack") +
geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
# Green-to-red palette ordered from slowest to fastest band
scale_fill_manual(
values = c(
"V2 (<40%)" = "darkgreen",
"V3 (40-50%)" = "green2",
"V4 (50-60%)" = "greenyellow",
"V5 (60-70%)" = "yellow",
"V6 (70-80%)" = "orange",
"V7 (80-90%)" = "tomato",
"V8 (90-100%)" = "darkred"
)
) +
labs(
title = paste("Relative Velocity Band Effort % and Max Velocity Trend for", player_id),
x = "Week",
y = "Percent of Weekly Efforts",
fill = "Relative Velocity Band"
) +
# Secondary axis is an identity copy: both series share the same 0-100 scale
scale_y_continuous(
sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
) +
theme_classic()
# Loop over injured athletes and create their plots
# (same layout as the single-player demonstration above: stacked band
# efforts, max-velocity trend, shaded injury windows). lapply is used for
# its side effect of printing each plot; each element returns NULL.
unique(injuries_and_running$anon_id) %>%
lapply(function(player_id) {
# Filter data for this player
plot_data <- injuries_and_running %>%
filter(anon_id == player_id)
if (nrow(plot_data) == 0) return(NULL) # Skip if no data
# Get injury weeks
injury_week_nums <- plot_data %>%
filter(injury == 1) %>%
pull(week)
# Plot
plot <- ggplot(plot_data, aes(x = week)) +
# Highlight injury weeks with shaded blue areas
geom_rect(data = data.frame(week = injury_week_nums),
aes(xmin = week - 3, xmax = week + 3, ymin = -Inf, ymax = Inf),
fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
geom_col(aes(y = pct_effort, fill = relative_band), position = "stack") +
geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
scale_fill_manual(
values = c(
"V2 (<40%)" = "darkgreen",
"V3 (40-50%)" = "green2",
"V4 (50-60%)" = "greenyellow",
"V5 (60-70%)" = "yellow",
"V6 (70-80%)" = "orange",
"V7 (80-90%)" = "tomato",
"V8 (90-100%)" = "darkred"
)
) +
# Identity secondary axis: both series share the same percent scale
scale_y_continuous(
name = "Percent of Weekly Effort",
sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
) +
labs(
title = paste("Velocity Band Exposure & Max % for Injured Player", player_id),
x = "Week",
fill = "Relative Velocity Band"
) +
theme_classic()
print(plot)
return(NULL)
})
TODO: Fix the above code so that ID_95's last week isn't V2.
# Logistic regression: does that week's % of max velocity predict an
# injury flag for the athlete-week?
model_injury <- glm(injury ~ pct_of_max_velocity,
                    family = "binomial",
                    data = injuries_and_running)
summary(model_injury)
# Build the analysis table for injury vs. non-injury summary statistics:
# one row per athlete-week, plus a formatted date label for display.
injuries_and_running_clean <- injuries_and_running %>%
  mutate(week_formatted = format(as.Date(week), "%m-%d-%Y")) %>%
  distinct(anon_id, week, .keep_all = TRUE)
# Injury weeks only, with a unique "athlete__date" event key
injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 1) %>%
  mutate(injury_event = sprintf("%s__%s", anon_id, week_formatted))
# Summary statistics for injury weeks.
# injury_mean is kept as a standalone value because later plot subtitles use it.
injury_mean <- mean(injury_weeks$pct_of_max_velocity, na.rm = TRUE)
injury_summary_stats <- injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )
# 95% t-based confidence interval for the mean % of max velocity
se <- injury_summary_stats$sd_pct / sqrt(injury_summary_stats$n)
t_crit <- qt(0.975, df = injury_summary_stats$n - 1)
lower <- injury_summary_stats$mean_pct - t_crit * se
upper <- injury_summary_stats$mean_pct + t_crit * se
cat("95% CI for mean pct_of_max_velocity:", round(lower, 2), "-", round(upper, 2))
# Same summary statistics, computed for the non-injury weeks
non_injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 0)
non_injury_mean <- mean(non_injury_weeks$pct_of_max_velocity, na.rm = TRUE)
non_injury_summary_stats <- non_injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )
# 95% t-based confidence interval for the non-injury mean
non_injury_se <- non_injury_summary_stats$sd_pct / sqrt(non_injury_summary_stats$n)
non_injury_t_crit <- qt(0.975, df = non_injury_summary_stats$n - 1)
non_injury_lower <- non_injury_summary_stats$mean_pct - non_injury_t_crit * non_injury_se
non_injury_upper <- non_injury_summary_stats$mean_pct + non_injury_t_crit * non_injury_se
cat("95% CI for mean pct_of_max_velocity (non-injury weeks):",
    round(non_injury_lower, 2), "-", round(non_injury_upper, 2))
# Welch two-sample t-test: % of max velocity in injury vs. non-injury weeks
# (var.equal = FALSE so unequal group variances are not assumed)
t_test <- t.test(pct_of_max_velocity ~ injury,
                 data = injuries_and_running_clean,
                 var.equal = FALSE)
print(t_test)
While the mean % of max velocity is higher for the injury group, the difference is small and not significant.
# Plot of % Max Velocity during injury and non injury weeks
# (one point per athlete-week; injury weeks highlighted in red, group
# means reported in the subtitle)
ggplot(injuries_and_running_clean, aes(x = week, y = pct_of_max_velocity)) +
geom_point(aes(color = factor(injury))) +
scale_color_manual(values = c("0" = "black", "1" = "red"),
labels = c("No Injury", "Injury"),
name = "Injury Status") +
labs(
title = "% of Max Velocity by Week",
subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_mean, 2), " | Mean % of Max Velocity, Non-Injury: ", round(non_injury_mean, 2)),
x = "Week",
y = "% of Max Velocity Reached"
) +
theme_classic()
# Scatter of % of max velocity for the injury weeks only
ggplot(injury_weeks, aes(x = week, y = pct_of_max_velocity)) +
geom_point(color = "red") +
labs(
title = "% of Max Velocity During Injury Weeks",
subtitle = paste("% of Max Velocity Mean: ", round(injury_mean, 2)),
x = "Week",
y = "% of Max Velocity"
) +
theme_classic()
# Create bar chart: one bar per injury event ("athlete__date"), height =
# that week's % of max velocity, value printed above each bar
ggplot(injury_weeks, aes(x = factor(injury_event), y = pct_of_max_velocity)) +
geom_col(fill = "#CFB87C") +
geom_text(aes(label = round(pct_of_max_velocity, 1)), # round to 1 decimal place
vjust = -0.5, size = 2.75) +
labs(
title = "% of Max Velocity During Injury Weeks",
subtitle = paste("Mean: ", round(injury_mean, 2)),
x = "Injury Event",
y = "% of Max Velocity"
) +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Looking into the week before injury
# Get % of max velocity for week prior
# Lags are computed within each athlete (group_by), after sorting by week.
# NOTE: lag1/lag2/lag3 are NA for each athlete's first 1/2/3 weeks, and the
# derived change/sum columns inherit those NAs — downstream code must handle
# them (with na.rm, drop_na, or t.test's default NA omission).
week_prior <- injuries_and_running_clean %>%
group_by(anon_id) %>%
arrange(week) %>%
mutate(
lag1_pct_of_max_velocity = lag(pct_of_max_velocity, 1),
lag2_pct_of_max_velocity = lag(pct_of_max_velocity, 2),
lag3_pct_of_max_velocity = lag(pct_of_max_velocity, 3),
change_last2weeks = lag1_pct_of_max_velocity - lag2_pct_of_max_velocity,
sum_last2weeks = lag1_pct_of_max_velocity + lag2_pct_of_max_velocity,
sum_last3weeks = lag1_pct_of_max_velocity + lag2_pct_of_max_velocity + lag3_pct_of_max_velocity
) %>%
ungroup()
# Split the lagged table into injury and non-injury weeks, and attach the
# "athlete__date" event key to the injury rows.
injury_week_prior <- week_prior %>%
  filter(injury == 1) %>%
  mutate(injury_event = paste0(anon_id, "__", week_formatted))
non_injury_week_prior <- week_prior %>%
  filter(injury == 0)
# Group means of the PRIOR week's % of max velocity.
# Bug fix: na.rm = TRUE is required here — lag1_pct_of_max_velocity is NA
# for each athlete's first observed week, so without it these means
# evaluate to NA (and the plot subtitles below print "NA").
injury_week_prior_mean <- mean(injury_week_prior$lag1_pct_of_max_velocity, na.rm = TRUE)
non_injury_week_prior_mean <- mean(non_injury_week_prior$lag1_pct_of_max_velocity, na.rm = TRUE)
injury_week_prior_mean
non_injury_week_prior_mean
# Plot of % Max Velocity during injury and non injury weeks
# (y is the PRIOR week's value; points with no prior week are dropped by
# ggplot with a missing-values warning)
ggplot(week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
geom_point(aes(color = factor(injury))) +
scale_color_manual(values = c("0" = "black", "1" = "red"),
labels = c("No Injury", "Injury"),
name = "Injury Status") +
labs(
title = "% of Max Velocity of Prior Week",
subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_week_prior_mean, 2), " | Mean % of Max Velocity, Non-Injury: ", round(non_injury_week_prior_mean, 2)),
x = "Week",
y = "% of Max Velocity Reached"
) +
theme_classic()
# Scatter: prior-week % of max velocity for injury events only
ggplot(injury_week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
geom_point(color = "red") +
labs(
title = "% of Max Velocity the week before injury",
subtitle = paste("% of Max Velocity Mean: ", round(injury_week_prior_mean, 2)),
x = "Week",
y = "% of Max Velocity"
) +
theme_classic()
# Bar chart for week prior to injury % Max.
# Bug fix: the text labels previously printed pct_of_max_velocity (the
# injury week's value) while the bars show lag1_pct_of_max_velocity (the
# prior week's value); labels now match the plotted column.
ggplot(injury_week_prior, aes(x = factor(injury_event), y = lag1_pct_of_max_velocity)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(lag1_pct_of_max_velocity, 1)), # round to 1 decimal place
            vjust = -0.5, size = 2.75) +
  labs(
    title = "% of Max Velocity the Week Before Injury",
    subtitle = paste("% Max Mean: ", round(injury_week_prior_mean, 2)),
    x = "Injury Event",
    y = "% of Max Velocity"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Welch t-test: does the PRIOR week's % of max velocity differ between
# injury and non-injury weeks? (NAs from the lag are dropped by t.test)
t_test_week_prior <- t.test(lag1_pct_of_max_velocity ~ injury,
                            data = week_prior,
                            var.equal = FALSE)
print(t_test_week_prior)
Again, we fail to reject the null hypothesis that the means of the two groups are equal.
# Look two weeks back: drop rows with no lag-2 value, then compare group means
lag_2_injury <- injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)
lag_2_non_injury <- non_injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)
# Group means of the % of max velocity two weeks before
mean_lag2_injury <- mean(lag_2_injury$lag2_pct_of_max_velocity)
mean_lag2_non_injury <- mean(lag_2_non_injury$lag2_pct_of_max_velocity)
mean_lag2_injury
mean_lag2_non_injury
# T-test to compare lag2_pct_of_max_velocity (two weeks before) between injury and non injury
t_test_2week_prior <- t.test(
lag2_pct_of_max_velocity ~ injury,
data = week_prior,
var.equal = FALSE # use TRUE if you assume equal variances
)
# Print results
print(t_test_2week_prior)
# T-test to compare lag3_pct_of_max_velocity (three weeks before) between injury and non injury
t_test_3week_prior <- t.test(
lag3_pct_of_max_velocity ~ injury,
data = week_prior,
var.equal = FALSE # use TRUE if you assume equal variances
)
# Print results
print(t_test_3week_prior)
# Welch t-test: CHANGE in % max velocity over the last two weeks,
# injury vs. non-injury weeks
t_test_2week_change <- t.test(
  change_last2weeks ~ injury,
  data = week_prior,
  var.equal = FALSE # use TRUE if you assume equal variances
)
# Print results
print(t_test_2week_change)
# Welch t-test: SUM of the last two weeks' % max velocity.
# Bug fix: this result was previously assigned to t_test_2week_change,
# silently overwriting the change test above.
t_test_2week_sum <- t.test(
  sum_last2weeks ~ injury,
  data = week_prior,
  var.equal = FALSE # use TRUE if you assume equal variances
)
# Print results
print(t_test_2week_sum)
# Welch t-test: sum of the last three weeks' % max velocity
# (renamed from t_test_3week_change — it tests a sum, not a change)
t_test_3week_sum <- t.test(
  sum_last3weeks ~ injury,
  data = week_prior,
  var.equal = FALSE # use TRUE if you assume equal variances
)
# Print results
print(t_test_3week_sum)
None of these showed significantly different means
#removing extra data
# Environment cleanup before the Catapult/Historical Running analysis:
# drop intermediate data frames from the injury/velocity-band work above.
remove(band_long, bigs, clean_anons, combo, cor_data, cor_matrix, daily_90_counts,
full_grid, hit_90_counts, ID_11, Incident_dates, injured_data, injuries_and_running,
injury_by_group, injury_weeks, max_velocities, model_all_bands, model_big,
model_bigs, model_combo, model_skill, player_counts_long, player_data,
player_positions, plot_data, plots_by_position, position_averages,
position_averages_with_team, pre_injury_weeks, QBs, relative_bands,
relative_bands_long, skill, sprint_counts, sprint_exposure_big,
sprint_exposure_combo, sprint_exposure_skill, sprint_injury_table, V8,
weekly_90_counts, weekly_band_effort_by_group, weekly_effort, weekly_effort_long,
weekly_effort_wide, weekly_max_pct, weekly_max_velocity, weekly_sprint_counts,
weekly_sprint_counts_lagged, weekly_sprint_exposure, weekly_velocity_efforts)
#removing extra values and functions
remove(all_athletes, all_weeks, band_levels, BIG, COMBO, SKILL, injured_ids,
injury_week_nums, overall_avg, player_id, positions, Positions, qb_avg, team_avg,
plot_hit_90_by_position)
# All athlete IDs present in the Catapult session data
catapult_ids <- unique(Catapult_Session_clean$anon_id)
# Average of each athlete's Session.Max.Velocity per calendar week.
# Changes from the original: floor_date() needs no grouping, so the
# pointless group_by(anon_id) before it is dropped; a trailing ungroup()
# is added so the grouping does not silently leak into later pipelines;
# and mean(x, na.rm = TRUE) replaces the mean(na.omit(x)) idiom.
Catapult_Session_clean <- Catapult_Session_clean %>%
  mutate(week = floor_date(Date, unit = "week")) %>%
  group_by(anon_id, week) %>%
  mutate(Week.Max.Velocity = mean(Session.Max.Velocity, na.rm = TRUE)) %>%
  ungroup()
# Per-athlete Kendall correlation between week order and weekly max velocity.
# Changes from the original: the athlete count is no longer hard-coded to
# 104; and cor() can return NA (e.g. for constant speeds), which made the
# original `if (kend <= -0.5)` error out — NA results are now skipped.
n_athletes <- length(catapult_ids)
kendalls <- rep(NA_real_, n_athletes)
negative_trend <- "0"
for (i in seq_len(n_athletes)) {
  speeds <- unique(Catapult_Session_clean[Catapult_Session_clean$anon_id == catapult_ids[i], ]$Week.Max.Velocity)
  # Only assess athletes with more than 5 distinct weekly values
  if (length(speeds) > 5) {
    # Weeks are indexed in reverse so a declining athlete yields a negative tau
    weeks <- rev(seq_along(speeds))
    kend <- cor(weeks, speeds, method = "kendall")
    if (!is.na(kend)) {
      kendalls[i] <- kend
      # Flag athletes with a strong negative trend (tau <= -0.5)
      if (kend <= -0.5) {
        negative_trend <- c(negative_trend, catapult_ids[i])
      }
      trend_color <- ifelse(kend <= -0.5, "red", "black")
      p <- ggplot(filter(Catapult_Session_clean, anon_id == catapult_ids[i]),
                  aes(week, Week.Max.Velocity)) +
        geom_point(color = trend_color) +
        geom_line(color = trend_color) +
        labs(title = "Average Top Running Speed per Week", subtitle = kend) +
        ylim(0, 25)
      print(p)
    }
  }
}
# Distribution of the per-athlete Kendall correlations, with the +/-0.5
# "strong trend" cutoffs and the sample median marked
hist(kendalls)
abline(v = c(-0.5, 0.5, median(kendalls, na.rm = TRUE)))
Based on the Kendall Correlations calculated from the average maximum velocity per week for each player we can see that the distribution of correlations shows that most players tend to have a weak Kendall Correlation coefficient. This suggests that players may tend to see a decrease in maximum velocity per week throughout their time at CU. For most players with a negative coefficient though, it is considered really weak. The players with a strong negative correlation coefficient are ID 130, ID 178, ID 220, and ID 65. These are the ones that have almost a definitive negative trend in their maximum running speeds. These players are the only ones that we can say for certain have seen a decrease in their top running speed.
# Subset the four athletes flagged with the strongest negative Kendall trends
data_130 <- filter(Catapult_Session_clean, anon_id == "ID_130")
data_178 <- filter(Catapult_Session_clean, anon_id == "ID_178")
data_220 <- filter(Catapult_Session_clean, anon_id == "ID_220")
data_65 <- filter(Catapult_Session_clean, anon_id == "ID_65")
# Linear trend of weekly max velocity over time for ID_130
model_130 <- lm(Week.Max.Velocity ~ week, data = data_130)
summary(model_130)
Call:
lm(formula = Week.Max.Velocity ~ week, data = data_130)
Residuals:
Min 1Q Median 3Q Max
-3.8907 -0.1691 0.0532 0.3117 0.6947
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 705.159994 39.547783 17.83 <2e-16 ***
week -0.034219 0.001962 -17.45 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7347 on 245 degrees of freedom
Multiple R-squared: 0.554, Adjusted R-squared: 0.5522
F-statistic: 304.3 on 1 and 245 DF, p-value: < 2.2e-16
# Linear trend of weekly max velocity over time for ID_178
model_178 <- lm(Week.Max.Velocity ~ week, data=data_178)
summary(model_178)
Call:
lm(formula = Week.Max.Velocity ~ week, data = data_178)
Residuals:
Min 1Q Median 3Q Max
-4.1797 -0.3056 0.2855 0.3749 1.6381
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 608.681042 57.354398 10.61 <2e-16 ***
week -0.029414 0.002845 -10.34 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9921 on 238 degrees of freedom
Multiple R-squared: 0.31, Adjusted R-squared: 0.3071
F-statistic: 106.9 on 1 and 238 DF, p-value: < 2.2e-16
# Linear trend of weekly max velocity over time for ID_220
model_220 <- lm(Week.Max.Velocity ~ week, data=data_220)
summary(model_220)
Call:
lm(formula = Week.Max.Velocity ~ week, data = data_220)
Residuals:
Min 1Q Median 3Q Max
-4.5179 -0.2466 0.3286 0.9217 2.2527
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.498e+03 1.022e+02 14.65 <2e-16 ***
week -7.368e-02 5.073e-03 -14.52 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.368 on 172 degrees of freedom
Multiple R-squared: 0.5508, Adjusted R-squared: 0.5482
F-statistic: 210.9 on 1 and 172 DF, p-value: < 2.2e-16
# Linear trend of weekly max velocity over time for ID_65
model_65 <- lm(Week.Max.Velocity ~ week, data=data_65)
summary(model_65)
Call:
lm(formula = Week.Max.Velocity ~ week, data = data_65)
Residuals:
Min 1Q Median 3Q Max
-12.2451 -0.6397 0.2826 0.5492 2.2876
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 905.887494 59.118723 15.32 <2e-16 ***
week -0.044850 0.002963 -15.13 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.314 on 479 degrees of freedom
Multiple R-squared: 0.3235, Adjusted R-squared: 0.3221
F-statistic: 229.1 on 1 and 479 DF, p-value: < 2.2e-16
Looking at the slope coefficients for players that had the lowest correlation coefficients we can see that all of them had a relatively small slope value. All of them being lower than 0.04 in absolute value. This suggests that while they had a detectable negative trend throughout their time at CU, none of them had any sharp decreases.
Looking at team data as a whole, since January 1, 2024 there is absolutely no deviation from 0. That means that since January 1, 2024, the team has had the same average running imbalance of 0. This makes sense given that the team is so large and that imbalances can go from -100% to 100%. This suggests that throughout this time, at no point was there a team sway to one side. There also weren’t any points since January 1, 2024 that the team had any large spikes in average absolute value of running imbalance. This suggests that at no point throughout the season were there larger spikes than normal in running imbalance. Each player tends to have a very unique trend in their running imbalance. Looking at how the team varies but also at how each player varies throughout the season, it’s hard to make out any pattern that’s applicable to most people. The variance of running imbalance varies greatly between each player. Instead we looked at the variances between players who were injured and those who were not. Based on three different bootstrapped findings, we can see that the differences in variance between players who were injured and those who were not were statistically significant.
For the first bootstrap, we compared the variance of the pooled groups, meaning that the variance in running imbalance for players who were injured and those who were not were compared. This resulted in a 90% confidence interval which suggested that the difference in variance between the two groups is between 0.30 and 3.45. This suggests that when looking at the variance of the two groups separately but with all the players pooled together, the variances will most likely differ by an amount between 0.30 and 3.45, and the variance for the injured pool will be greater than that of the uninjured pool. For the second bootstrap, each player’s variance was taken individually. This unpooled approach was taken to see if an individual player’s variance in running imbalance could potentially be related to HSI risk. The bootstrap algorithm in this case took the averaged variances of the bootstrapped sample for each group and compared them. This bootstrap produced a 90% confidence interval for the difference in average variance between the two groups of 0.79 to 1.32. These results suggest that players who sustained a hamstring injury since January 1, 2024 had, on average, a greater variance in their running imbalance by about 1.06. This suggests that there is a relationship between variance in running imbalance and HSI risk. This found increase in variability will be used to address the following questions. For the third bootstrap, we wanted to see if there was a difference in the average mean absolute value in running imbalance between players who were and weren’t injured. The bootstrapping algorithm for this test calculated the absolute value of each running imbalance measurement and found the average for each sample. This test found that at the 90% significance level, injured players had an average running imbalance absolute value between 0.06 and 0.32 greater than their uninjured counterparts.
These values though, when we consider that the range of running imbalance goes from 0 to 100 is small and may be hard to detect when out in the field.
We also looked at the relationship between running imbalance and higher level position. From this analysis we found that there doesn’t seem to be a super strong relationship between the three categories and average running imbalance variance. The bootstrap revealed that there are potentially significant differences between those who are Bigs and Combos. But, those who are Skills weren’t able to differentiate themselves between the two groups. Along with this, we looked at the average absolute value in running imbalance for the three groups. This analysis told us that while Combos and Bigs tended to have the same average absolute value running imbalance, Skills had a significantly higher average absolute running imbalance value. We can see from the very last chart in this analysis that Skills make up the most of those with hamstring injuries followed closely by Combos and Bigs making up around half of the amount of Skills. This is interesting considering that the amounts of Bigs, Skills, and Combos within the Historical Running data set are all roughly the same.
#team variation
# Variance of every running-imbalance measurement, pooled across the team
Historical_Running_clean %>%
summarize(Team_Variation = var(Running.Imbalance))
#individual player variation
# One variance per athlete
Historical_Running_clean %>%
group_by(anon_id) %>%
summarize(Player_Variation = var(na.omit(Running.Imbalance))) %>%
ungroup()
#average variance in running imbalance across all players
# NOTE(review): mutate() repeats each player's variance on every one of that
# player's rows, so the final mean() is weighted by each player's number of
# observations — it is NOT an unweighted mean of per-player variances.
# Confirm that weighting is intended.
Historical_Running_clean %>%
group_by(anon_id) %>%
mutate(Player.var = var(na.omit(Running.Imbalance))) %>%
ungroup() %>%
summarize(Average_Player_Variance = mean(na.omit(Player.var)))
# Per-date spread and average magnitude of running imbalance, attached to
# every row for use in the trend plots below
Historical_Running_clean <- Historical_Running_clean %>%
  group_by(Date) %>%
  mutate(
    Date.Variance = var(Running.Imbalance, na.rm = TRUE),
    Date.Avg.Abs.Value = mean(abs(Running.Imbalance), na.rm = TRUE)
  ) %>%
  ungroup()
# Team-wide mean and standard deviation, reused as reference lines
team_mean <- mean(Historical_Running_clean$Running.Imbalance)
team_sd <- sd(Historical_Running_clean$Running.Imbalance)
#making scatter plot of team running imbalance data throughout season
# Reference lines: gold = team mean, black = +/-1 SD, gray = +/-2 SD;
# the smoothed line shows the overall linear trend
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = team_mean, color = "#CFB87C") +
geom_hline(yintercept = team_mean + team_sd) +
geom_hline(yintercept = team_mean - team_sd) +
geom_hline(yintercept = team_mean + (2*team_sd), color = "#A2A4A3") +
geom_hline(yintercept = team_mean - (2*team_sd), color = "#A2A4A3") +
geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215")
#making scatter plot of team running imbalance data throughout season
# Same scatter, but with the per-date average |imbalance| envelope drawn
# above and below zero instead of SD bands
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = team_mean, color = "#CFB87C") +
geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
geom_line(aes(x=Date, y=Date.Avg.Abs.Value), color = "#CFB87C") +
geom_line(aes(x=Date, y=-Date.Avg.Abs.Value), color = "#CFB87C") +
labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215")
# Distribution of every running-imbalance measurement since Jan 1, 2024
Historical_Running_clean %>%
  ggplot(aes(Running.Imbalance)) +
  geom_histogram() +
  labs(title = "Team Running Imbalance Since January 1, 2024", x = "Running Imbalance")
#making histogram for running imbalance of all injured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance)) +
geom_histogram(fill = "#CFB87C", alpha = 0.75) +
#adding vertical lines at the 2.5th and 97.5th percentiles (central 95% of the data)
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.025), color = "#CFB87C") +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.975), color = "#CFB87C") +
xlim(-21,21) +
labs(title = "Running Imbalance for Players with HSI since January 1, 2024")
#making histogram for running imbalance of all uninjured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
geom_histogram(fill = "black", alpha = 0.75) +
#adding vertical lines at the 2.5th and 97.5th percentiles (central 95% of the data)
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.025)) +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.975)) +
xlim(-21,21) +
labs(title = "Running Imbalance for Players without HSI since January 1, 2024")
#Plotting injured and uninjured histograms over top one another
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
geom_histogram(alpha = 0.75) +
#vertical lines at the 5th and 95th percentiles for uninjured players
#(a 90% interval, not 95% as originally labeled)
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.05)) +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.95)) +
geom_histogram(data = Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance), fill = "#CFB87C", alpha = 0.75) +
#vertical lines at the 5th and 95th percentiles for injured players
#(a 90% interval, not 95% as originally labeled)
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.05), color = "#CFB87C") +
geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.95), color = "#CFB87C") +
xlim(-21,21) +
labs(title = "Running Imbalance for Players with and without HSI since January 1, 2024")
#making scatter plot of running imbalance data for injured players
# (same y-limits as the uninjured plot below so the two are comparable)
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
ylim(-21,21) +
labs(title = "Running Imbalance for Players with HSI since January 1, 2024")
#making scatter plot of running imbalance for uninjured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Date, Running.Imbalance)) +
geom_point(alpha = 0.3) +
ylim(-21,21) +
labs(title = "Running Imbalance for Players without HSI since January 1, 2024")
# Split into injured / uninjured groups, attaching each measurement's
# magnitude and each player's running-imbalance variance
injured_data <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs) %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
uninjured_data <- Historical_Running_clean %>%
  filter(anon_id %in% uninjured_IDs) %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
# Bootstrap the pooled within-group variances and their difference.
# Fixes from the original: the comment claimed 1000 iterations while the
# loop ran 5000 (the count is now a single named constant), and the sample
# sizes were hard-coded to 1672/2391 — they now track the data via nrow(),
# which is the standard bootstrap (resample each group to its own size).
n_boot <- 5000
group_variances <- data.frame(injured_var = rep(NA_real_, n_boot),
                              uninjured_var = rep(NA_real_, n_boot),
                              diff_in_var = rep(NA_real_, n_boot))
for (i in seq_len(n_boot)) {
  # Seeded per iteration so the full run is reproducible
  set.seed(i)
  # Resample each group with replacement, preserving group sizes
  injured_sample <- sample_n(injured_data, replace = TRUE, size = nrow(injured_data))
  uninjured_sample <- sample_n(uninjured_data, replace = TRUE, size = nrow(uninjured_data))
  # Store this iteration's variances and their difference
  group_variances[i, 1] <- var(injured_sample$Running.Imbalance)
  group_variances[i, 2] <- var(uninjured_sample$Running.Imbalance)
  group_variances[i, 3] <- group_variances[i, 1] - group_variances[i, 2]
}
ggplot(data=group_variances, aes(diff_in_var)) +
geom_histogram() +
geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.05), color= "#CFB87C") +
geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.95), color= "#CFB87C") +
labs(x="Difference in Variance")
ggplot(data=group_variances) +
geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
geom_histogram(aes(uninjured_var), alpha = 0.75) +
labs(x="Variance")
#making a data frame to hold all of the average player variances between groups
mean_player_variances <- data.frame(injured_var = rep(NA, 5000),
                                    uninjured_var = rep(NA, 5000),
                                    diff_in_var = rep(NA, 5000))
#bootstrap of the mean per-player variance, 5000 iterations
for(i in 1:5000){
  #seed fixed per iteration so the bootstrap is reproducible
  set.seed(i)
  #resample each group to its own row count rather than hard-coding sizes
  injured_sample <- sample_n(injured_data, replace = TRUE, size = nrow(injured_data))
  uninjured_sample <- sample_n(uninjured_data, replace = TRUE, size = nrow(uninjured_data))
  #store the average player variances and their difference
  mean_player_variances[i, 1] = mean(na.omit(injured_sample$Player.Variance))
  mean_player_variances[i, 2] = mean(na.omit(uninjured_sample$Player.Variance))
  mean_player_variances[i, 3] = mean_player_variances[i, 1] - mean_player_variances[i, 2]
}
#bootstrap distribution of the difference in average variance, with 90% interval
ggplot(data = mean_player_variances, aes(diff_in_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.95), color = "#CFB87C") +
  labs(x = "Difference in Variance")
#overlaid bootstrap distributions for each group (gold = injured)
ggplot(data = mean_player_variances) +
  geom_histogram(aes(injured_var), alpha = 0.75, fill = "#CFB87C") +
  geom_histogram(aes(uninjured_var), alpha = 0.75) +
  labs(x = "Average Variance")
#making a data frame to hold all of the average absolute differences from 0
group_distance <- data.frame(injured_dist = rep(NA, 5000),
                             uninjured_dist = rep(NA, 5000),
                             diff_in_dist = rep(NA, 5000))
#bootstrap for mean absolute imbalance (not variances), 5000 iterations
for(i in 1:5000){
  #seed fixed per iteration so the bootstrap is reproducible
  set.seed(i)
  #resample each group to its own row count rather than hard-coding sizes
  injured_sample <- sample_n(injured_data, replace = TRUE, size = nrow(injured_data))
  uninjured_sample <- sample_n(uninjured_data, replace = TRUE, size = nrow(uninjured_data))
  #store the mean absolute distances and their difference
  group_distance[i, 1] = mean(injured_sample$Player.Absolute.Dist)
  group_distance[i, 2] = mean(uninjured_sample$Player.Absolute.Dist)
  group_distance[i, 3] = group_distance[i, 1] - group_distance[i, 2]
}
#bootstrap distribution of the difference in mean absolute imbalance, 90% interval
ggplot(data = group_distance, aes(diff_in_dist)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.95), color = "#CFB87C") +
  labs(x = "Difference in Distance")
#overlaid bootstrap distributions for each group (gold = injured)
ggplot(data = group_distance) +
  geom_histogram(aes(injured_dist), alpha = 0.75, fill = "#CFB87C") +
  geom_histogram(aes(uninjured_dist), alpha = 0.75) +
  labs(x = "Average Absolute Value")
#removing junk that came from the loops
#NOTE(review): team_mean and team_sd are not defined anywhere in this section —
#presumably created earlier in the notebook; confirm they still exist here or
#remove() will warn about missing objects
remove(i,team_mean, team_sd, injured_sample, uninjured_sample, group_variances, injured_data, uninjured_data, mean_player_variances, group_distance)
# Position groups used to collapse specific positions into broader categories.
COMBO <- c("QB", "LB", "TE", "RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Positions <- c("COMBO", "BIG", "SKILL")
# Attach the broad category to the incident report, keeping the specific
# position in its own column (unlisted positions become NA).
Incident_Report_clean <- Incident_Report_clean %>%
  mutate(Specific.Position = Position,
         Position = case_when(Specific.Position %in% COMBO ~ "COMBO",
                              Specific.Position %in% BIG ~ "BIG",
                              Specific.Position %in% SKILL ~ "SKILL"))
# Same categorization for the catapult sessions.
Catapult_Session_clean <- Catapult_Session_clean %>%
  mutate(Specific.Position = Primary.Position,
         Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
                              Primary.Position %in% BIG ~ "BIG",
                              Primary.Position %in% SKILL ~ "SKILL"))
# Build one lookup of ID -> category/position from both sources.
incident_info <- Incident_Report_clean[, c("anon_id", "Position", "Specific.Position")]
catapult_info <- Catapult_Session_clean[, c("anon_id", "Position", "Specific.Position")]
info <- rbind(incident_info, catapult_info) %>%
  distinct()
# Attach positions to the running data and compute each player's variance in
# running imbalance.
Historical_Running_clean <- left_join(Historical_Running_clean, info, by = "anon_id",
                                      relationship = "many-to-many") %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
# One histogram of running imbalance per position category.
for(i in seq_along(Positions)){
  p <- ggplot(data = Historical_Running_clean[Historical_Running_clean$Position == Positions[i],],
              aes(Running.Imbalance)) +
    geom_histogram() +
    labs(subtitle = Positions[i])
  print(p)
}
# One row per player: category, specific position, and imbalance variance.
Player_Summary_Stats <- Historical_Running_clean %>%
  distinct(anon_id, Position, Specific.Position, Player.Variance)
# One histogram of player-level variance per position category.
for(i in seq_along(Positions)){
  p <- ggplot(data = Player_Summary_Stats[Player_Summary_Stats$Position == Positions[i],],
              aes(Player.Variance)) +
    geom_histogram() +
    labs(subtitle = Positions[i])
  print(p)
}
#splitting up the player summaries by category; na.omit drops players with a
#missing category or missing variance
COMBOS <- Player_Summary_Stats %>%
  filter(Position == "COMBO") %>%
  na.omit()
SKILLS <- Player_Summary_Stats %>%
  filter(Position == "SKILL") %>%
  na.omit()
BIGS <- Player_Summary_Stats %>%
  filter(Position == "BIG") %>%
  na.omit()
#bootstrap distribution of the average player variance per category
group_avg_variance <- data.frame(COMBO_var = rep(NA, 5000),
                                 SKILL_var = rep(NA, 5000),
                                 BIG_var = rep(NA, 5000))
for(i in 1:5000){
  set.seed(i)
  #resample each category to its own player count instead of hard-coding the
  #sizes, so the code stays correct if the rosters change
  combo_sample <- sample_n(COMBOS, size = nrow(COMBOS), replace = TRUE)
  skill_sample <- sample_n(SKILLS, size = nrow(SKILLS), replace = TRUE)
  big_sample <- sample_n(BIGS, size = nrow(BIGS), replace = TRUE)
  group_avg_variance[i, 1] <- mean(na.omit(combo_sample$Player.Variance))
  group_avg_variance[i, 2] <- mean(na.omit(skill_sample$Player.Variance))
  group_avg_variance[i, 3] <- mean(na.omit(big_sample$Player.Variance))
}
#bootstrap distribution and 90% interval for each category
ggplot(data = group_avg_variance, aes(COMBO_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.05)) +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.95))
ggplot(data = group_avg_variance, aes(SKILL_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.05)) +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.95))
ggplot(data = group_avg_variance, aes(BIG_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.05)) +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.95))
#all three bootstrap distributions overlaid
ggplot(data = group_avg_variance) +
  geom_histogram(aes(COMBO_var), alpha = 0.5, fill = "red") +
  geom_histogram(aes(SKILL_var), alpha = 0.5, fill = "blue") +
  geom_histogram(aes(BIG_var), alpha = 0.5, fill = "yellow")
#90% bootstrap confidence intervals for each category's average variance
var_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                           Lower_Bound = c(quantile(group_avg_variance$COMBO_var, 0.05),
                                           quantile(group_avg_variance$SKILL_var, 0.05),
                                           quantile(group_avg_variance$BIG_var, 0.05)),
                           Median = c(quantile(group_avg_variance$COMBO_var, 0.5),
                                      quantile(group_avg_variance$SKILL_var, 0.5),
                                      quantile(group_avg_variance$BIG_var, 0.5)),
                           Upper_Bound = c(quantile(group_avg_variance$COMBO_var, 0.95),
                                           quantile(group_avg_variance$SKILL_var, 0.95),
                                           quantile(group_avg_variance$BIG_var, 0.95)))
head(var_confints)
ggplot(data = var_confints, aes(x = Category, y = Median, ymin = Lower_Bound, ymax = Upper_Bound)) +
  geom_pointrange() +
  labs(y = "Running Imbalance Variance 90% CI")
#absolute value of running imbalance for every observation
Historical_Running_clean <- Historical_Running_clean %>%
  mutate(Abs.Value.Running.Imbalance = abs(Running.Imbalance))
#observation-level splits by position category
COMBOS <- Historical_Running_clean %>%
  filter(Position == "COMBO")
SKILLS <- Historical_Running_clean %>%
  filter(Position == "SKILL")
BIGS <- Historical_Running_clean %>%
  filter(Position == "BIG")
#bootstrap distribution of the mean absolute imbalance per category
group_avg_dist <- data.frame(COMBO_dist = rep(NA, 5000),
                             SKILL_dist = rep(NA, 5000),
                             BIG_dist = rep(NA, 5000))
for(i in 1:5000){
  set.seed(i)
  #resample each category to its own observation count instead of hard-coding
  combo_sample <- sample_n(COMBOS, size = nrow(COMBOS), replace = TRUE)
  skill_sample <- sample_n(SKILLS, size = nrow(SKILLS), replace = TRUE)
  big_sample <- sample_n(BIGS, size = nrow(BIGS), replace = TRUE)
  group_avg_dist[i, 1] <- mean(na.omit(combo_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i, 2] <- mean(na.omit(skill_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i, 3] <- mean(na.omit(big_sample$Abs.Value.Running.Imbalance))
}
#all three bootstrap distributions overlaid
ggplot(data = group_avg_dist) +
  geom_histogram(aes(COMBO_dist), alpha = 0.5, fill = "red") +
  geom_histogram(aes(SKILL_dist), alpha = 0.5, fill = "blue") +
  geom_histogram(aes(BIG_dist), alpha = 0.5, fill = "yellow") +
  labs(x = "Average Absolute Value Running Imbalance")
#90% bootstrap confidence intervals for each category's mean absolute imbalance
dist_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                            Lower_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.05),
                                            quantile(group_avg_dist$SKILL_dist, 0.05),
                                            quantile(group_avg_dist$BIG_dist, 0.05)),
                            Median = c(quantile(group_avg_dist$COMBO_dist, 0.5),
                                       quantile(group_avg_dist$SKILL_dist, 0.5),
                                       quantile(group_avg_dist$BIG_dist, 0.5)),
                            Upper_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.95),
                                            quantile(group_avg_dist$SKILL_dist, 0.95),
                                            quantile(group_avg_dist$BIG_dist, 0.95)))
head(dist_confints)
ggplot(data = dist_confints, aes(x = Category, y = Median, ymin = Lower_Bound, ymax = Upper_Bound)) +
  geom_pointrange() +
  labs(y = "Running Imbalance Absolute Value 90% CI")
#one row per injury incident, and one row per player with a known category
Injury_Incidents <- distinct(Incident_Report_clean[,c("anon_id", "Position", "Date.of.Injury")])
Position_counts <- na.omit(distinct(Historical_Running_clean[,c("anon_id", "Position")]))
#injuries per category vs. players per category
ggplot(Injury_Incidents, aes(Position)) +
  geom_bar()
ggplot(Position_counts, aes(Position)) +
  geom_bar()
#clean up this section's objects; the original call named the nonexistent
#`confints`, listed group_avg_variance twice, and re-removed objects already
#deleted earlier (group_distance, group_variances, injured_data, injured_sample,
#mean_player_variances), each of which triggered a warning
remove(big_sample, BIGS, catapult_info, combo_sample, COMBOS, group_avg_variance,
       incident_info, info, p, Player_Summary_Stats, skill_sample, SKILLS, BIG, COMBO, i,
       Positions, SKILL, dist_confints, group_avg_dist,
       Injury_Incidents, Position_counts, var_confints)
Based on the analysis below, there do not seem to be any major discernible differences in running imbalance before or following an injury. This suggests that there may not be a direct link between HSI risk and the running imbalance value directly beforehand. Instead, many of the injured players appear to follow a relatively consistent trend that is not entirely dependent on time. Looking at summary statistics of running imbalance in the weeks leading up to and following a hamstring injury, there are also no glaring trends. For this analysis, we looked at the mean and variance in running imbalance per week leading up to and after an injury for all of the injured players with running imbalance data. This showed us that there is no clear indicator of HSI risk in running imbalance or any summary statistic of it. Instead, it may be more useful to look at each player’s total running imbalance and their individual variance. This appears to be a more useful tool for differentiating between injured and uninjured athletes.
#getting running imbalances for just injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)
#new columns: Weeks.After.Injury is negative before an injury, positive after,
#and 0 on the injury date itself (when there is data for that day); a bare NA
#recycles to the full column length, avoiding the fragile hard-coded row counts
Injured_Historical_Running$Weeks.After.Injury <- NA
Injured_Historical_Running$Injury.Count <- NA
#matching injury-count column on the incident report
Incident_Report_clean$Injury.Count <- NA
#Label every running-imbalance observation with the week it falls in relative
#to an injury (-4..-1 before, 0 on the day, 1..4 after) and with which injury
#number it belongs to. Windows from different injuries of the same player can
#overlap; because later loop iterations overwrite earlier labels, each row ends
#up tagged with the last injury date processed that covers it.
#NOTE(review): the 22 below is the hard-coded length of injured_IDs — confirm
#it still matches if the injured cohort changes
#go through all of the injured players in the data set
for(i in 1:22){
#get the dates each player was injured
injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
#go through all of the dates in which the player had an injury
for(j in 1:length(injury_dates)){
#calculate dates for 1, 2, 3, 4 weeks before and after each injury date
past_1 <- injury_dates[j]-7
past_2 <- injury_dates[j]-14
past_3 <- injury_dates[j]-21
past_4 <- injury_dates[j]-28
future_1 <- injury_dates[j]+7
future_2 <- injury_dates[j]+14
future_3 <- injury_dates[j]+21
future_4 <- injury_dates[j]+28
#Calculating how many injuries this is for the player
#NOTE(review): count n - j + 1 assumes a specific ordering of injury_dates
#(the j-th date gets count length - j + 1) — confirm unique() returns them
#in the intended chronological order
injury_count <- as.character((length(injury_dates)) - j + 1)
#compare date of data point for each player to date of injury, store in Weeks.After.Injury column, store injury count
#first week after injury: injury date (exclusive) to +7 days (inclusive)
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > injury_dates[j] &
Injured_Historical_Running$Date<=future_1,]$Weeks.After.Injury <- "1"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > injury_dates[j] &
Injured_Historical_Running$Date<=future_1,]$Injury.Count <- injury_count
#second week after injury: (+7, +14]
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_1 &
Injured_Historical_Running$Date<=future_2,]$Weeks.After.Injury <- "2"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_1 &
Injured_Historical_Running$Date<=future_2,]$Injury.Count <- injury_count
#third week after injury: (+14, +21]
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_2 &
Injured_Historical_Running$Date<=future_3,]$Weeks.After.Injury <- "3"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_2 &
Injured_Historical_Running$Date<=future_3,]$Injury.Count <- injury_count
#fourth week after injury: (+21, +28]
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_3 &
Injured_Historical_Running$Date<=future_4,]$Weeks.After.Injury <- "4"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date > future_3 &
Injured_Historical_Running$Date<=future_4,]$Injury.Count <- injury_count
#week right before injury: [-7, injury date), exclusive of the injury day
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < injury_dates[j] &
Injured_Historical_Running$Date>=past_1,]$Weeks.After.Injury <- "-1"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < injury_dates[j] &
Injured_Historical_Running$Date>=past_1,]$Injury.Count <- injury_count
#two weeks before injury: [-14, -7)
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_1 &
Injured_Historical_Running$Date>=past_2,]$Weeks.After.Injury <- "-2"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_1 &
Injured_Historical_Running$Date>=past_2,]$Injury.Count <- injury_count
#three weeks before injury: [-21, -14)
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_2 &
Injured_Historical_Running$Date>=past_3,]$Weeks.After.Injury <- "-3"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_2 &
Injured_Historical_Running$Date>=past_3,]$Injury.Count <- injury_count
#four weeks before injury: [-28, -21)
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_3 &
Injured_Historical_Running$Date>=past_4,]$Weeks.After.Injury <- "-4"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date < past_3 &
Injured_Historical_Running$Date>=past_4,]$Injury.Count <- injury_count
#Date of Injury itself is week 0
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date == injury_dates[j],]$Weeks.After.Injury <- "0"
Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
Injured_Historical_Running$Date == injury_dates[j],]$Injury.Count <- injury_count
#adding injury count to indicent report
Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Injury.Count <- injury_count
}
}
# Turn the week labels and injury counts into factors, then fold the injured
# players' annotations back into the full running data set (the join keys
# default to all shared columns).
Injured_Historical_Running <- Injured_Historical_Running %>%
  mutate(across(c(Weeks.After.Injury, Injury.Count), factor))
Historical_Running_clean <- left_join(Historical_Running_clean, Injured_Historical_Running)
# Drop the temporary objects created by the labeling loop.
remove(future_1, future_2, future_3, future_4, i, injury_count, injury_dates, j,
       past_1, past_2, past_3, past_4)
remove(Injured_Historical_Running)
#running imbalance over time for each injured player, with every injury date
#marked and points colored by week relative to injury; seq_along replaces the
#hard-coded 1:22 so the loop tracks the actual cohort size
for(i in seq_along(injured_IDs)){
  p <- ggplot(data = Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],],
              aes(Date, Running.Imbalance)) +
    geom_line() +
    geom_point(aes(color = Weeks.After.Injury)) +
    geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury) +
    #green-to-red shades before the injury, purple-to-blue after, black on the day
    scale_color_manual(values = c("-4" = "green", "-3" = "yellow", "-2" = "orange", "-1" = "red",
                                  "0" = "black", "1" = "purple", "2" = "navy", "3" = "blue", "4" = "skyblue")) +
    theme_minimal() +
    labs(title = "Running Imbalance", subtitle = injured_IDs[i])
  print(p)
}
#making summary statistics of running imbalance per week relative to injury:
#within-week variance and within-week mean absolute imbalance, computed per
#player, injury number, and week label (rows outside any injury window have NA
#keys and all land in one group together)
Historical_Running_clean <- Historical_Running_clean %>%
group_by(anon_id, Injury.Count, Weeks.After.Injury) %>%
mutate(Weeks.After.Injury.Variability = var(Running.Imbalance),
Weeks.After.Injury.Mean = mean(abs(Running.Imbalance))) %>%
ungroup()
#mean absolute running imbalance per week relative to injury for each injured
#player, windowed to 30 days around their injury dates
for(i in seq_along(injured_IDs)){
  p <- ggplot(data = Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],],
              aes(Date, Weeks.After.Injury.Mean, group = Injury.Count)) +
    geom_line() +
    geom_point(aes(color = Weeks.After.Injury)) +
    xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury) - 30,
         max(Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury) + 30) +
    geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury) +
    #the player ID was previously passed unnamed to labs() and never displayed;
    #it is the subtitle, matching the variance plots below
    labs(title = "Average Absolute Distance per Week", subtitle = injured_IDs[i]) +
    scale_color_manual(values = c("-4" = "green", "-3" = "yellow", "-2" = "orange", "-1" = "red",
                                  "0" = "black", "1" = "purple", "2" = "navy", "3" = "blue", "4" = "skyblue"))
  print(p)
}
# Variance in running imbalance per week relative to injury, one plot per
# injured player, windowed to 30 days around their injury dates.
for(i in 1:22){
  # Injury dates for this player, reused for the x-limits and reference lines.
  player_injuries <- Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury
  p <- ggplot(data = Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],],
              aes(Date, Weeks.After.Injury.Variability, group = Injury.Count)) +
    geom_line() +
    geom_point(aes(color = Weeks.After.Injury)) +
    xlim(min(player_injuries) - 30, max(player_injuries) + 30) +
    geom_vline(xintercept = player_injuries) +
    labs(title = "Variance in Running Imbalance per Week", subtitle = injured_IDs[i]) +
    scale_color_manual(values = c("-4" = "green", "-3" = "yellow", "-2" = "orange", "-1" = "red",
                                  "0" = "black", "1" = "purple", "2" = "navy", "3" = "blue", "4" = "skyblue"))
  print(p)
}
#classify each player's running-imbalance trend (non-linear / linear / none)
#and plot it; a player needs more than 15 observations to be considered
for(i in seq_along(all_IDs)){
  player_data <- Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]
  #only plotting if there are over 15 data points for each player
  if(nrow(player_data) > 15){
    #effective degrees of freedom of the GAM smooth: > 3 indicates non-linearity
    df <- summary(gam(Running.Imbalance ~ s(Days.Since.Start),
                      data = player_data))$edf
    #Kendall correlation measures the strength of a monotone trend
    Kendall_Cor <- cor(x = player_data$Days.Since.Start,
                       y = player_data$Running.Imbalance,
                       method = "kendall")
    #NOTE(review): titles say "Since January 1, 2024" but the x-axis is limited
    #to Jan-May 2025 — confirm which range is intended
    if(df > 3){
      #non-linear trend detected by the GAM
      p <- ggplot(data = player_data, aes(Date, Running.Imbalance)) +
        geom_line() +
        geom_point() +
        geom_smooth(color = "skyblue",
                    se = FALSE) +
        labs(title = "Running Imbalance Since January 1, 2024 (non-linear trend)", subtitle = all_IDs[i]) +
        xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
    } else if(abs(Kendall_Cor) > 0.2){
      #linear trend: |Kendall correlation| above 0.2 with a near-linear GAM fit;
      #the original wrote abs(Kendall_Cor > 0.2), which took the absolute value
      #of a logical and silently dropped players with negative trends
      p <- ggplot(data = player_data, aes(Date, Running.Imbalance)) +
        geom_line() +
        geom_point() +
        geom_smooth(color = "red",
                    method = "lm",
                    se = FALSE) +
        labs(title = "Running Imbalance Since January 1, 2024 (linear trend)", subtitle = all_IDs[i]) +
        xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
    } else {
      #no detectable trend; the original built this plot but never printed it
      #and mislabeled it "non-linear trend"
      p <- ggplot(data = player_data, aes(Date, Running.Imbalance)) +
        geom_line() +
        geom_point() +
        labs(title = "Running Imbalance Since January 1, 2024 (no trend)", subtitle = all_IDs[i]) +
        xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
    }
    print(p)
  }
}
#plotted only from January 1, 2025 to May 1, 2025
#Kendall correlation of running imbalance vs. time for every player with
#enough data, stored for the comparison below
trends <- data.frame(ID = all_IDs,
                     KC = rep(0, length(all_IDs)))
for(i in seq_along(all_IDs)){
  player_data <- Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]
  #only plotting if there are over 15 data points for each player
  if(nrow(player_data) > 15){
    #strength of the monotone trend; the GAM fit the original computed here was
    #never used, so it has been dropped
    Kendall_Cor <- cor(x = player_data$Days.Since.Start,
                       y = player_data$Running.Imbalance,
                       method = "kendall")
    trends[trends$ID == all_IDs[i], 2] = Kendall_Cor
    #injured players drawn in skyblue, everyone else in black
    if(all_IDs[i] %in% injured_IDs){
      p <- ggplot(data = player_data,
                  aes(Date, Running.Imbalance)) +
        geom_point(color = "skyblue") +
        geom_line(color = "skyblue") +
        labs(subtitle = Kendall_Cor)
    } else {
      p <- ggplot(data = player_data,
                  aes(Date, Running.Imbalance)) +
        geom_point() +
        geom_line() +
        labs(subtitle = Kendall_Cor)
    }
    print(p)
  }
}
#plotted only from January 1, 2025 to May 1, 2025
# Flag each player as injured (1) or not (0).
trends <- trends %>%
  mutate(injured = as.numeric(ID %in% injured_IDs))
# Positive-trend players, split by injury status.
ggplot(trends[trends$KC > 0 & trends$injured == 1,], aes(KC)) +
  geom_histogram()
ggplot(trends[trends$KC > 0 & trends$injured == 0,], aes(KC)) +
  geom_histogram()
# Drop the loop leftovers.
remove(i, p, df, Kendall_Cor)
Based on the analysis below, we can see that variance in running imbalance from the time of the injury to the end of the predicted return-to-play range is not, on its own, a very strong predictor of whether it will take longer for a player to recover. In this analysis, we used the running imbalance in the time frame starting with the injury date and ending at the end of the prognosis time frame. With these specific running imbalance values, we calculated the variance in running imbalance and whether or not the athlete returned to play within the predicted time frame. This analysis found that when using the variance in these time frames as the only predictor in a logistic regression model, the slope coefficient associated with the variance was statistically significant at the \(\alpha = 0.01\) significance level. Even so, the cross-validated accuracy of the model was only around 0.6, suggesting that it was not especially strong in practice. In order to understand the impact that variance in running imbalance had on whether or not a player returned in the predicted time frame, we performed a bootstrap. We separated the observations in which players did and did not return in the predicted time frame into two different data sets. We then sampled from each of these two data sets and calculated the average variance for each sample. This was repeated 5000 times. This gave us an estimate of the average variance in running imbalance for players who returned within the predicted time frame and those who did not. This bootstrap revealed that, during their time of recovery, players who did not return within the predicted time frame had a variance roughly 1.2 greater than those who returned on time.
This tells us that while variance in running imbalance is not strong enough on its own to predict whether or not an athlete will return within the predicted time frame, it can be used to supplement a prognosis or to make adjustments to the prognosis during recovery from an HSI.
#Calculating expected and actual return-to-play dates in the incident report
Incident_Report_clean <- Incident_Report_clean %>%
  filter(!is.na(Injury.Prognosis)) %>%
  #translate the prognosis category into a predicted return window; case_when
  #keeps the Date class intact, unlike the original nested ifelse() calls,
  #which silently converted the dates to numbers before the as.Date() wrapper
  #restored them (relying on a default origin)
  #beginning of predicted range of return
  mutate(Expected.Start.Return = case_when(
           Injury.Prognosis == "No Expected Time Loss" ~ Date.of.Injury,
           Injury.Prognosis == "Less than 1 Week" ~ Date.of.Injury,
           Injury.Prognosis == "1-4 Weeks" ~ Date.of.Injury + days(7),
           TRUE ~ Date.of.Injury + days(28)),
         #end of predicted range of return
         Expected.End.Return = case_when(
           Injury.Prognosis == "No Expected Time Loss" ~ Date.of.Injury,
           Injury.Prognosis == "Less than 1 Week" ~ Date.of.Injury + days(7),
           Injury.Prognosis == "1-4 Weeks" ~ Date.of.Injury + days(28),
           TRUE ~ Date.of.Injury + days(56))) %>%
  group_by(anon_id, Date.of.Injury) %>%
  #actual clearance date: injury date plus total days spent in any status
  mutate(Actual.Return = Date.of.Injury + days(sum(na.omit(Days.in.Status)))) %>%
  ungroup()
# Running imbalance for each injured player with their injury, actual return,
# and predicted return window marked.
for(i in 1:22){
  # Incident rows for this player: injury dates, return date, predicted window.
  player_incidents <- Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]
  p <- ggplot(data = Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],],
              aes(Date, Running.Imbalance)) +
    geom_line(linewidth = 0.1) +
    geom_point() +
    # Red line: date of injury.
    geom_vline(xintercept = player_incidents$Date.of.Injury, color = "red") +
    # Solid green line: actual return date.
    geom_vline(xintercept = player_incidents$Actual.Return, color = "green", linetype = 1) +
    # Dotted purple lines: start and end of the predicted return-to-play range.
    geom_vline(xintercept = player_incidents$Expected.Start.Return, color = "purple", linetype = 3) +
    geom_vline(xintercept = player_incidents$Expected.End.Return, color = "purple", linetype = 3) +
    labs(title = injured_IDs[i])
  print(p)
}
#looking at running imbalance of only injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)
#binary flag: 1 if the observation falls inside an injury's prognosis window;
#a bare 0 recycles to the full column length, avoiding the hard-coded row count
Injured_Historical_Running$Date.in.Range <- 0
#flag the observations that fall between each injury and its predicted return
for(i in seq_along(injured_IDs)){
  #ID_50 does not have enough running imbalance data, so skip it entirely;
  #the original break sat inside the inner loop and only fired after the
  #player's injury dates had already been fetched
  if(injured_IDs[i] == "ID_50"){
    next
  }
  #get the dates each player was injured
  injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i],]$Date.of.Injury)
  #go through dates of injury for each player
  for(j in seq_along(injury_dates)){
    #end of the predicted return window for this instance of injury
    expected_return <- as.Date(Incident_Report_clean[Incident_Report_clean$anon_id == injured_IDs[i] &
                                                       Incident_Report_clean$Date.of.Injury == injury_dates[j],]$Expected.End.Return[1])
    #mark observations between the injury date and the end of the prediction
    Injured_Historical_Running[Injured_Historical_Running$anon_id == injured_IDs[i] &
                                 Injured_Historical_Running$Date >= injury_dates[j] &
                                 Injured_Historical_Running$Date <= expected_return,]$Date.in.Range <- 1
  }
}
# Variance in running imbalance within each injury's prognosis window, one
# value per player-and-injury-number combination.
Injured_Historical_Running <- Injured_Historical_Running %>%
  filter(Date.in.Range == 1) %>%
  group_by(anon_id, Injury.Count) %>%
  mutate(Injury.Variability = var(Running.Imbalance)) %>%
  ungroup()
# Combine the running-imbalance and incident data, then flag on-time returns.
Injured_Data <- Injured_Historical_Running %>%
  left_join(Incident_Report_clean, by = c("anon_id", "Injury.Count"),
            relationship = "many-to-many") %>%
  # 1 if the player was cleared inside the predicted window, 0 otherwise.
  mutate(Return.in.Range = ifelse(Actual.Return >= Expected.Start.Return &
                                    Actual.Return <= Expected.End.Return, 1, 0)) %>%
  # Drop rows missing either key quantity.
  filter(!is.na(Injury.Variability),
         !is.na(Return.in.Range))
set.seed(1000)
#75/25 train-test split; floor() makes the sample size an explicit whole number
#and seq_len() replaces the 1:n idiom
#NOTE(review): rows from the same player/injury can land in both sets, so the
#test error may be optimistic — consider splitting by player instead
rows <- sample(seq_len(nrow(Injured_Data)), size = floor(nrow(Injured_Data) * 0.75), replace = FALSE)
Injured_train <- Injured_Data[rows,]
Injured_test <- Injured_Data[-rows,]
#logistic regression: does within-window variance predict an on-time return?
return_to_play_model <- glm(Return.in.Range ~ Injury.Variability, data = Injured_train, family = "binomial")
#looking at coefficients and p-values
summary(return_to_play_model)
#classify the held-out rows with a 0.5 probability threshold
Injured_test <- Injured_test %>%
  mutate(Prediction = ifelse(predict(return_to_play_model, newdata = Injured_test, type = "response") > 0.5, 1, 0))
#classification error rate on the test set
Injured_test %>%
  summarize(CER = mean(Prediction != Return.in.Range))
# Refit the same logistic regression on all of the data for cross-validation.
return_to_play_cv <- glm(Return.in.Range ~ Injury.Variability, data = Injured_Data, family = "binomial")
# Misclassification rate at a 0.5 probability threshold.
cost <- function(obs, pred){
  misclassified <- ((pred <= 0.5) & obs == 1) | ((pred > 0.5) & obs == 0)
  mean(misclassified)
}
set.seed(1000)
# 10-fold cross-validation of the classification error.
ten_cv <- cv.glm(data = Injured_Data, glmfit = return_to_play_cv, cost, K = 10)
# Average cross-validated error.
ten_cv$delta[1]
#taking only players who recovered in the predicted time frame
In_Range <- Injured_Data %>%
  filter(Return.in.Range == 1,
         !is.na(Injury.Variability))
#taking only players who did not recover in the predicted time frame
Out_Range <- Injured_Data %>%
  filter(Return.in.Range == 0,
         !is.na(Injury.Variability))
#holds the bootstrapped average variances and their difference
Range_Variances <- data.frame(in.avg.var = rep(NA, 5000),
                              out.avg.var = rep(NA, 5000),
                              diff.avg.var = rep(NA, 5000))
#bootstrapping 5000 times
for(i in 1:5000){
  set.seed(i)
  #resample each group to its own row count rather than hard-coding the sizes
  #sample from players who recovered in the predicted range
  in_sample <- sample_n(In_Range, size = nrow(In_Range), replace = TRUE)
  #sample from players who did not recover in the predicted range
  out_sample <- sample_n(Out_Range, size = nrow(Out_Range), replace = TRUE)
  #average within-window variance of each sample (the original comment said
  #"variances" but these are means of the Injury.Variability column)
  Range_Variances[i, 1] <- mean(in_sample$Injury.Variability)
  Range_Variances[i, 2] <- mean(out_sample$Injury.Variability)
  Range_Variances[i, 3] <- Range_Variances[i, 2] - Range_Variances[i, 1] #late minus on-time
}
#plotting differences in average variance
ggplot(data = Range_Variances, aes(diff.avg.var)) +
  geom_histogram() +
  #adding in 90% CI (does not include 0)
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.95), color = "#CFB87C")
#plotting the average variances of the two groups (gold = late returns)
ggplot(data = Range_Variances) +
  #players who recovered in the predicted time frame
  geom_histogram(aes(in.avg.var), alpha = 0.75) +
  #players who did not recover in the predicted time frame
  geom_histogram(aes(out.avg.var), alpha = 0.75, fill = "#CFB87C") +
  labs(x = "Average Variance")
remove(p,i,j,rows,ten_cv, cost, Injured_Data, Injured_Historical_Running, Injured_test, Injured_train, Range_Variances, return_to_play_cv, return_to_play_model, expected_return, injury_dates, In_Range, Out_Range, in_sample, out_sample)